Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c1
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_inode.c6
-rw-r--r--fs/9p/vfs_inode_dotl.c16
-rw-r--r--fs/9p/xattr.c13
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/inode.c13
-rw-r--r--fs/affs/amigaffs.c4
-rw-r--r--fs/affs/inode.c17
-rw-r--r--fs/affs/namei.c1
-rw-r--r--fs/afs/cell.c6
-rw-r--r--fs/afs/dynroot.c35
-rw-r--r--fs/afs/inode.c8
-rw-r--r--fs/afs/internal.h9
-rw-r--r--fs/afs/main.c2
-rw-r--r--fs/afs/rxrpc.c2
-rw-r--r--fs/afs/server_list.c2
-rw-r--r--fs/afs/super.c4
-rw-r--r--fs/afs/vl_rotate.c10
-rw-r--r--fs/afs/volume.c26
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/afs/xattr.c2
-rw-r--r--fs/anon_inodes.c4
-rw-r--r--fs/attr.c4
-rw-r--r--fs/autofs/autofs_i.h20
-rw-r--r--fs/autofs/init.c9
-rw-r--r--fs/autofs/inode.c463
-rw-r--r--fs/autofs/root.c6
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/bcachefs/Kconfig95
-rw-r--r--fs/bcachefs/Makefile91
-rw-r--r--fs/bcachefs/acl.c464
-rw-r--r--fs/bcachefs/acl.h60
-rw-r--r--fs/bcachefs/alloc_background.c2159
-rw-r--r--fs/bcachefs/alloc_background.h259
-rw-r--r--fs/bcachefs/alloc_foreground.c1638
-rw-r--r--fs/bcachefs/alloc_foreground.h224
-rw-r--r--fs/bcachefs/alloc_types.h126
-rw-r--r--fs/bcachefs/backpointers.c860
-rw-r--r--fs/bcachefs/backpointers.h140
-rw-r--r--fs/bcachefs/bbpos.h37
-rw-r--r--fs/bcachefs/bbpos_types.h18
-rw-r--r--fs/bcachefs/bcachefs.h1164
-rw-r--r--fs/bcachefs/bcachefs_format.h2454
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h368
-rw-r--r--fs/bcachefs/bkey.c1120
-rw-r--r--fs/bcachefs/bkey.h778
-rw-r--r--fs/bcachefs/bkey_buf.h61
-rw-r--r--fs/bcachefs/bkey_cmp.h129
-rw-r--r--fs/bcachefs/bkey_methods.c459
-rw-r--r--fs/bcachefs/bkey_methods.h179
-rw-r--r--fs/bcachefs/bkey_sort.c201
-rw-r--r--fs/bcachefs/bkey_sort.h54
-rw-r--r--fs/bcachefs/bset.c1592
-rw-r--r--fs/bcachefs/bset.h541
-rw-r--r--fs/bcachefs/btree_cache.c1215
-rw-r--r--fs/bcachefs/btree_cache.h131
-rw-r--r--fs/bcachefs/btree_gc.c2146
-rw-r--r--fs/bcachefs/btree_gc.h114
-rw-r--r--fs/bcachefs/btree_io.c2297
-rw-r--r--fs/bcachefs/btree_io.h225
-rw-r--r--fs/bcachefs/btree_iter.c3261
-rw-r--r--fs/bcachefs/btree_iter.h944
-rw-r--r--fs/bcachefs/btree_journal_iter.c543
-rw-r--r--fs/bcachefs/btree_journal_iter.h65
-rw-r--r--fs/bcachefs/btree_key_cache.c1074
-rw-r--r--fs/bcachefs/btree_key_cache.h48
-rw-r--r--fs/bcachefs/btree_key_cache_types.h34
-rw-r--r--fs/bcachefs/btree_locking.c817
-rw-r--r--fs/bcachefs/btree_locking.h433
-rw-r--r--fs/bcachefs/btree_trans_commit.c1162
-rw-r--r--fs/bcachefs/btree_types.h725
-rw-r--r--fs/bcachefs/btree_update.c950
-rw-r--r--fs/bcachefs/btree_update.h340
-rw-r--r--fs/bcachefs/btree_update_interior.c2476
-rw-r--r--fs/bcachefs/btree_update_interior.h332
-rw-r--r--fs/bcachefs/btree_write_buffer.c375
-rw-r--r--fs/bcachefs/btree_write_buffer.h14
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h44
-rw-r--r--fs/bcachefs/buckets.c2170
-rw-r--r--fs/bcachefs/buckets.h458
-rw-r--r--fs/bcachefs/buckets_types.h92
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c166
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.h15
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal_types.h23
-rw-r--r--fs/bcachefs/chardev.c784
-rw-r--r--fs/bcachefs/chardev.h31
-rw-r--r--fs/bcachefs/checksum.c804
-rw-r--r--fs/bcachefs/checksum.h213
-rw-r--r--fs/bcachefs/clock.c193
-rw-r--r--fs/bcachefs/clock.h38
-rw-r--r--fs/bcachefs/clock_types.h37
-rw-r--r--fs/bcachefs/compress.c732
-rw-r--r--fs/bcachefs/compress.h73
-rw-r--r--fs/bcachefs/counters.c107
-rw-r--r--fs/bcachefs/counters.h17
-rw-r--r--fs/bcachefs/darray.c24
-rw-r--r--fs/bcachefs/darray.h105
-rw-r--r--fs/bcachefs/data_update.c653
-rw-r--r--fs/bcachefs/data_update.h49
-rw-r--r--fs/bcachefs/debug.c954
-rw-r--r--fs/bcachefs/debug.h32
-rw-r--r--fs/bcachefs/dirent.c580
-rw-r--r--fs/bcachefs/dirent.h71
-rw-r--r--fs/bcachefs/disk_groups.c622
-rw-r--r--fs/bcachefs/disk_groups.h111
-rw-r--r--fs/bcachefs/disk_groups_types.h18
-rw-r--r--fs/bcachefs/ec.c1981
-rw-r--r--fs/bcachefs/ec.h260
-rw-r--r--fs/bcachefs/ec_types.h41
-rw-r--r--fs/bcachefs/errcode.c68
-rw-r--r--fs/bcachefs/errcode.h273
-rw-r--r--fs/bcachefs/error.c302
-rw-r--r--fs/bcachefs/error.h242
-rw-r--r--fs/bcachefs/extent_update.c173
-rw-r--r--fs/bcachefs/extent_update.h12
-rw-r--r--fs/bcachefs/extents.c1511
-rw-r--r--fs/bcachefs/extents.h765
-rw-r--r--fs/bcachefs/extents_types.h40
-rw-r--r--fs/bcachefs/eytzinger.h281
-rw-r--r--fs/bcachefs/fifo.h127
-rw-r--r--fs/bcachefs/fs-common.c501
-rw-r--r--fs/bcachefs/fs-common.h43
-rw-r--r--fs/bcachefs/fs-io-buffered.c1106
-rw-r--r--fs/bcachefs/fs-io-buffered.h27
-rw-r--r--fs/bcachefs/fs-io-direct.c677
-rw-r--r--fs/bcachefs/fs-io-direct.h16
-rw-r--r--fs/bcachefs/fs-io-pagecache.c791
-rw-r--r--fs/bcachefs/fs-io-pagecache.h176
-rw-r--r--fs/bcachefs/fs-io.c1072
-rw-r--r--fs/bcachefs/fs-io.h184
-rw-r--r--fs/bcachefs/fs-ioctl.c570
-rw-r--r--fs/bcachefs/fs-ioctl.h81
-rw-r--r--fs/bcachefs/fs.c2010
-rw-r--r--fs/bcachefs/fs.h209
-rw-r--r--fs/bcachefs/fsck.c2490
-rw-r--r--fs/bcachefs/fsck.h15
-rw-r--r--fs/bcachefs/inode.c1205
-rw-r--r--fs/bcachefs/inode.h217
-rw-r--r--fs/bcachefs/io_misc.c524
-rw-r--r--fs/bcachefs/io_misc.h34
-rw-r--r--fs/bcachefs/io_read.c1210
-rw-r--r--fs/bcachefs/io_read.h158
-rw-r--r--fs/bcachefs/io_write.c1675
-rw-r--r--fs/bcachefs/io_write.h109
-rw-r--r--fs/bcachefs/io_write_types.h96
-rw-r--r--fs/bcachefs/journal.c1439
-rw-r--r--fs/bcachefs/journal.h450
-rw-r--r--fs/bcachefs/journal_io.c1966
-rw-r--r--fs/bcachefs/journal_io.h65
-rw-r--r--fs/bcachefs/journal_reclaim.c867
-rw-r--r--fs/bcachefs/journal_reclaim.h87
-rw-r--r--fs/bcachefs/journal_sb.c219
-rw-r--r--fs/bcachefs/journal_sb.h24
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c320
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h22
-rw-r--r--fs/bcachefs/journal_types.h319
-rw-r--r--fs/bcachefs/keylist.c52
-rw-r--r--fs/bcachefs/keylist.h74
-rw-r--r--fs/bcachefs/keylist_types.h16
-rw-r--r--fs/bcachefs/logged_ops.c112
-rw-r--r--fs/bcachefs/logged_ops.h20
-rw-r--r--fs/bcachefs/lru.c164
-rw-r--r--fs/bcachefs/lru.h69
-rw-r--r--fs/bcachefs/mean_and_variance.c159
-rw-r--r--fs/bcachefs/mean_and_variance.h198
-rw-r--r--fs/bcachefs/mean_and_variance_test.c240
-rw-r--r--fs/bcachefs/migrate.c179
-rw-r--r--fs/bcachefs/migrate.h7
-rw-r--r--fs/bcachefs/move.c1154
-rw-r--r--fs/bcachefs/move.h158
-rw-r--r--fs/bcachefs/move_types.h36
-rw-r--r--fs/bcachefs/movinggc.c431
-rw-r--r--fs/bcachefs/movinggc.h12
-rw-r--r--fs/bcachefs/nocow_locking.c144
-rw-r--r--fs/bcachefs/nocow_locking.h50
-rw-r--r--fs/bcachefs/nocow_locking_types.h20
-rw-r--r--fs/bcachefs/opts.c602
-rw-r--r--fs/bcachefs/opts.h564
-rw-r--r--fs/bcachefs/printbuf.c447
-rw-r--r--fs/bcachefs/printbuf.h286
-rw-r--r--fs/bcachefs/quota.c979
-rw-r--r--fs/bcachefs/quota.h74
-rw-r--r--fs/bcachefs/quota_types.h43
-rw-r--r--fs/bcachefs/rebalance.c464
-rw-r--r--fs/bcachefs/rebalance.h27
-rw-r--r--fs/bcachefs/rebalance_types.h37
-rw-r--r--fs/bcachefs/recovery.c1157
-rw-r--r--fs/bcachefs/recovery.h39
-rw-r--r--fs/bcachefs/recovery_types.h65
-rw-r--r--fs/bcachefs/reflink.c414
-rw-r--r--fs/bcachefs/reflink.h81
-rw-r--r--fs/bcachefs/replicas.c1059
-rw-r--r--fs/bcachefs/replicas.h93
-rw-r--r--fs/bcachefs/replicas_types.h27
-rw-r--r--fs/bcachefs/sb-clean.c396
-rw-r--r--fs/bcachefs/sb-clean.h16
-rw-r--r--fs/bcachefs/sb-downgrade.c188
-rw-r--r--fs/bcachefs/sb-downgrade.h10
-rw-r--r--fs/bcachefs/sb-errors.c170
-rw-r--r--fs/bcachefs/sb-errors.h19
-rw-r--r--fs/bcachefs/sb-errors_types.h269
-rw-r--r--fs/bcachefs/sb-members.c420
-rw-r--r--fs/bcachefs/sb-members.h227
-rw-r--r--fs/bcachefs/seqmutex.h48
-rw-r--r--fs/bcachefs/siphash.c173
-rw-r--r--fs/bcachefs/siphash.h87
-rw-r--r--fs/bcachefs/six.c920
-rw-r--r--fs/bcachefs/six.h393
-rw-r--r--fs/bcachefs/snapshot.c1713
-rw-r--r--fs/bcachefs/snapshot.h268
-rw-r--r--fs/bcachefs/str_hash.h370
-rw-r--r--fs/bcachefs/subvolume.c455
-rw-r--r--fs/bcachefs/subvolume.h38
-rw-r--r--fs/bcachefs/subvolume_types.h31
-rw-r--r--fs/bcachefs/super-io.c1353
-rw-r--r--fs/bcachefs/super-io.h104
-rw-r--r--fs/bcachefs/super.c2030
-rw-r--r--fs/bcachefs/super.h52
-rw-r--r--fs/bcachefs/super_types.h41
-rw-r--r--fs/bcachefs/sysfs.c1034
-rw-r--r--fs/bcachefs/sysfs.h48
-rw-r--r--fs/bcachefs/tests.c919
-rw-r--r--fs/bcachefs/tests.h15
-rw-r--r--fs/bcachefs/trace.c17
-rw-r--r--fs/bcachefs/trace.h1327
-rw-r--r--fs/bcachefs/two_state_shared_lock.c8
-rw-r--r--fs/bcachefs/two_state_shared_lock.h59
-rw-r--r--fs/bcachefs/util.c1159
-rw-r--r--fs/bcachefs/util.h834
-rw-r--r--fs/bcachefs/varint.c129
-rw-r--r--fs/bcachefs/varint.h11
-rw-r--r--fs/bcachefs/vstructs.h63
-rw-r--r--fs/bcachefs/xattr.c653
-rw-r--r--fs/bcachefs/xattr.h50
-rw-r--r--fs/befs/linuxvfs.c11
-rw-r--r--fs/bfs/dir.c9
-rw-r--r--fs/bfs/inode.c12
-rw-r--r--fs/binfmt_elf.c215
-rw-r--r--fs/binfmt_elf_fdpic.c20
-rw-r--r--fs/binfmt_misc.c388
-rw-r--r--fs/btrfs/Kconfig21
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/accessors.h16
-rw-r--r--fs/btrfs/async-thread.c12
-rw-r--r--fs/btrfs/async-thread.h6
-rw-r--r--fs/btrfs/backref.c5
-rw-r--r--fs/btrfs/backref.h10
-rw-r--r--fs/btrfs/bio.c47
-rw-r--r--fs/btrfs/block-group.c174
-rw-r--r--fs/btrfs/block-rsv.c24
-rw-r--r--fs/btrfs/btrfs_inode.h80
-rw-r--r--fs/btrfs/check-integrity.c2871
-rw-r--r--fs/btrfs/check-integrity.h20
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/ctree.c342
-rw-r--r--fs/btrfs/ctree.h142
-rw-r--r--fs/btrfs/defrag.c152
-rw-r--r--fs/btrfs/defrag.h2
-rw-r--r--fs/btrfs/delalloc-space.c8
-rw-r--r--fs/btrfs/delayed-inode.c47
-rw-r--r--fs/btrfs/delayed-inode.h1
-rw-r--r--fs/btrfs/delayed-ref.c199
-rw-r--r--fs/btrfs/delayed-ref.h70
-rw-r--r--fs/btrfs/dev-replace.c17
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/dir-item.h9
-rw-r--r--fs/btrfs/disk-io.c171
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/extent-io-tree.c272
-rw-r--r--fs/btrfs/extent-io-tree.h7
-rw-r--r--fs/btrfs/extent-tree.c577
-rw-r--r--fs/btrfs/extent-tree.h18
-rw-r--r--fs/btrfs/extent_io.c53
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/file-item.c17
-rw-r--r--fs/btrfs/file.c81
-rw-r--r--fs/btrfs/free-space-cache.c28
-rw-r--r--fs/btrfs/free-space-tree.c17
-rw-r--r--fs/btrfs/fs.h69
-rw-r--r--fs/btrfs/inode-item.c21
-rw-r--r--fs/btrfs/inode-item.h8
-rw-r--r--fs/btrfs/inode.c249
-rw-r--r--fs/btrfs/ioctl.c61
-rw-r--r--fs/btrfs/locking.c20
-rw-r--r--fs/btrfs/messages.c32
-rw-r--r--fs/btrfs/messages.h14
-rw-r--r--fs/btrfs/ordered-data.c138
-rw-r--r--fs/btrfs/ordered-data.h17
-rw-r--r--fs/btrfs/print-tree.c35
-rw-r--r--fs/btrfs/props.c1
-rw-r--r--fs/btrfs/qgroup.c914
-rw-r--r--fs/btrfs/qgroup.h152
-rw-r--r--fs/btrfs/raid-stripe-tree.c274
-rw-r--r--fs/btrfs/raid-stripe-tree.h50
-rw-r--r--fs/btrfs/ref-verify.c11
-rw-r--r--fs/btrfs/reflink.c5
-rw-r--r--fs/btrfs/relocation.c208
-rw-r--r--fs/btrfs/relocation.h9
-rw-r--r--fs/btrfs/root-tree.c12
-rw-r--r--fs/btrfs/root-tree.h8
-rw-r--r--fs/btrfs/scrub.c88
-rw-r--r--fs/btrfs/send.c8
-rw-r--r--fs/btrfs/space-info.c64
-rw-r--r--fs/btrfs/space-info.h3
-rw-r--r--fs/btrfs/super.c94
-rw-r--r--fs/btrfs/sysfs.c53
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c6
-rw-r--r--fs/btrfs/tests/inode-tests.c12
-rw-r--r--fs/btrfs/transaction.c234
-rw-r--r--fs/btrfs/transaction.h23
-rw-r--r--fs/btrfs/tree-checker.c87
-rw-r--r--fs/btrfs/tree-log.c93
-rw-r--r--fs/btrfs/ulist.c3
-rw-r--r--fs/btrfs/uuid-tree.c6
-rw-r--r--fs/btrfs/verity.c4
-rw-r--r--fs/btrfs/volumes.c533
-rw-r--r--fs/btrfs/volumes.h45
-rw-r--r--fs/btrfs/xattr.c14
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zoned.c459
-rw-r--r--fs/btrfs/zstd.c11
-rw-r--r--fs/buffer.c129
-rw-r--r--fs/ceph/acl.c12
-rw-r--r--fs/ceph/addr.c309
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c767
-rw-r--r--fs/ceph/crypto.c44
-rw-r--r--fs/ceph/debugfs.c10
-rw-r--r--fs/ceph/dir.c242
-rw-r--r--fs/ceph/export.c49
-rw-r--r--fs/ceph/file.c284
-rw-r--r--fs/ceph/inode.c567
-rw-r--r--fs/ceph/ioctl.c21
-rw-r--r--fs/ceph/locks.c57
-rw-r--r--fs/ceph/mds_client.c686
-rw-r--r--fs/ceph/mds_client.h13
-rw-r--r--fs/ceph/mdsmap.c29
-rw-r--r--fs/ceph/mdsmap.h75
-rw-r--r--fs/ceph/metric.c5
-rw-r--r--fs/ceph/quota.c29
-rw-r--r--fs/ceph/snap.c196
-rw-r--r--fs/ceph/super.c99
-rw-r--r--fs/ceph/super.h25
-rw-r--r--fs/ceph/xattr.c110
-rw-r--r--fs/char_dev.c4
-rw-r--r--fs/coda/coda_linux.c6
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/configfs/inode.c8
-rw-r--r--fs/cramfs/inode.c6
-rw-r--r--fs/crypto/bio.c39
-rw-r--r--fs/crypto/crypto.c163
-rw-r--r--fs/crypto/fname.c6
-rw-r--r--fs/crypto/fscrypt_private.h164
-rw-r--r--fs/crypto/hooks.c4
-rw-r--r--fs/crypto/inline_crypt.c32
-rw-r--r--fs/crypto/keyring.c82
-rw-r--r--fs/crypto/keysetup.c62
-rw-r--r--fs/crypto/keysetup_v1.c22
-rw-r--r--fs/crypto/policy.c83
-rw-r--r--fs/dax.c24
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/debugfs/file.c94
-rw-r--r--fs/debugfs/inode.c66
-rw-r--r--fs/debugfs/internal.h15
-rw-r--r--fs/devpts/inode.c6
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/debug_fs.c18
-rw-r--r--fs/dlm/lowcomms.c12
-rw-r--r--fs/dlm/midcomms.c39
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/inode.c14
-rw-r--r--fs/efivarfs/file.c2
-rw-r--r--fs/efivarfs/inode.c6
-rw-r--r--fs/efivarfs/internal.h9
-rw-r--r--fs/efivarfs/super.c70
-rw-r--r--fs/efs/inode.c5
-rw-r--r--fs/efs/super.c1
-rw-r--r--fs/erofs/Kconfig9
-rw-r--r--fs/erofs/compress.h6
-rw-r--r--fs/erofs/data.c7
-rw-r--r--fs/erofs/decompressor.c63
-rw-r--r--fs/erofs/decompressor_deflate.c6
-rw-r--r--fs/erofs/decompressor_lzma.c7
-rw-r--r--fs/erofs/inode.c101
-rw-r--r--fs/erofs/internal.h44
-rw-r--r--fs/erofs/super.c102
-rw-r--r--fs/erofs/utils.c27
-rw-r--r--fs/erofs/xattr.c2
-rw-r--r--fs/erofs/xattr.h4
-rw-r--r--fs/erofs/zdata.c1
-rw-r--r--fs/eventpoll.c6
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exfat/dir.c20
-rw-r--r--fs/exfat/exfat_fs.h15
-rw-r--r--fs/exfat/exfat_raw.h19
-rw-r--r--fs/exfat/file.c103
-rw-r--r--fs/exfat/inode.c37
-rw-r--r--fs/exfat/misc.c8
-rw-r--r--fs/exfat/namei.c83
-rw-r--r--fs/exfat/super.c15
-rw-r--r--fs/exportfs/expfs.c57
-rw-r--r--fs/ext2/dir.c220
-rw-r--r--fs/ext2/ext2.h23
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c13
-rw-r--r--fs/ext2/namei.c32
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext2/xattr.c4
-rw-r--r--fs/ext2/xattr.h2
-rw-r--r--fs/ext4/acl.h5
-rw-r--r--fs/ext4/balloc.c16
-rw-r--r--fs/ext4/crypto.c13
-rw-r--r--fs/ext4/ext4.h32
-rw-r--r--fs/ext4/extents.c25
-rw-r--r--fs/ext4/extents_status.c150
-rw-r--r--fs/ext4/fast_commit.c8
-rw-r--r--fs/ext4/file.c171
-rw-r--r--fs/ext4/fsmap.c9
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c47
-rw-r--r--fs/ext4/ioctl.c13
-rw-r--r--fs/ext4/mballoc-test.c349
-rw-r--r--fs/ext4/mballoc.c575
-rw-r--r--fs/ext4/move_extent.c11
-rw-r--r--fs/ext4/namei.c13
-rw-r--r--fs/ext4/readpage.c14
-rw-r--r--fs/ext4/resize.c94
-rw-r--r--fs/ext4/super.c85
-rw-r--r--fs/ext4/xattr.c10
-rw-r--r--fs/ext4/xattr.h2
-rw-r--r--fs/f2fs/compress.c63
-rw-r--r--fs/f2fs/data.c35
-rw-r--r--fs/f2fs/dir.c6
-rw-r--r--fs/f2fs/extent_cache.c53
-rw-r--r--fs/f2fs/f2fs.h11
-rw-r--r--fs/f2fs/file.c31
-rw-r--r--fs/f2fs/inline.c2
-rw-r--r--fs/f2fs/inode.c26
-rw-r--r--fs/f2fs/namei.c4
-rw-r--r--fs/f2fs/node.c20
-rw-r--r--fs/f2fs/recovery.c8
-rw-r--r--fs/f2fs/segment.c92
-rw-r--r--fs/f2fs/segment.h4
-rw-r--r--fs/f2fs/super.c158
-rw-r--r--fs/f2fs/xattr.c24
-rw-r--r--fs/f2fs/xattr.h2
-rw-r--r--fs/fat/inode.c25
-rw-r--r--fs/fat/misc.c6
-rw-r--r--fs/fat/nfs.c1
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fhandle.c6
-rw-r--r--fs/file.c153
-rw-r--r--fs/file_table.c51
-rw-r--r--fs/freevxfs/vxfs_inode.c6
-rw-r--r--fs/freevxfs/vxfs_super.c2
-rw-r--r--fs/fs-writeback.c41
-rw-r--r--fs/fsopen.c1
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/dax.c1
-rw-r--r--fs/fuse/dir.c10
-rw-r--r--fs/fuse/file.c8
-rw-r--r--fs/fuse/fuse_i.h21
-rw-r--r--fs/fuse/inode.c117
-rw-r--r--fs/fuse/readdir.c6
-rw-r--r--fs/fuse/xattr.c2
-rw-r--r--fs/gfs2/acl.h8
-rw-r--r--fs/gfs2/aops.c74
-rw-r--r--fs/gfs2/aops.h6
-rw-r--r--fs/gfs2/bmap.c75
-rw-r--r--fs/gfs2/bmap.h38
-rw-r--r--fs/gfs2/dir.c12
-rw-r--r--fs/gfs2/dir.h38
-rw-r--r--fs/gfs2/file.c18
-rw-r--r--fs/gfs2/glock.c31
-rw-r--r--fs/gfs2/glock.h113
-rw-r--r--fs/gfs2/glops.c24
-rw-r--r--fs/gfs2/glops.h4
-rw-r--r--fs/gfs2/incore.h2
-rw-r--r--fs/gfs2/inode.c40
-rw-r--r--fs/gfs2/inode.h60
-rw-r--r--fs/gfs2/log.h46
-rw-r--r--fs/gfs2/lops.h22
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/meta_io.c61
-rw-r--r--fs/gfs2/meta_io.h20
-rw-r--r--fs/gfs2/ops_fstype.c37
-rw-r--r--fs/gfs2/quota.c95
-rw-r--r--fs/gfs2/quota.h41
-rw-r--r--fs/gfs2/recovery.h18
-rw-r--r--fs/gfs2/rgrp.c12
-rw-r--r--fs/gfs2/rgrp.h85
-rw-r--r--fs/gfs2/super.c41
-rw-r--r--fs/gfs2/super.h54
-rw-r--r--fs/gfs2/trans.h24
-rw-r--r--fs/gfs2/util.h8
-rw-r--r--fs/gfs2/xattr.c10
-rw-r--r--fs/gfs2/xattr.h12
-rw-r--r--fs/hfs/attr.c2
-rw-r--r--fs/hfs/catalog.c8
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c16
-rw-r--r--fs/hfs/sysdep.c10
-rw-r--r--fs/hfsplus/catalog.c8
-rw-r--r--fs/hfsplus/inode.c22
-rw-r--r--fs/hfsplus/xattr.c2
-rw-r--r--fs/hfsplus/xattr.h2
-rw-r--r--fs/hostfs/hostfs_kern.c12
-rw-r--r--fs/hpfs/dir.c12
-rw-r--r--fs/hpfs/inode.c16
-rw-r--r--fs/hpfs/namei.c22
-rw-r--r--fs/hpfs/super.c10
-rw-r--r--fs/hugetlbfs/inode.c96
-rw-r--r--fs/init.c6
-rw-r--r--fs/inode.c53
-rw-r--r--fs/internal.h22
-rw-r--r--fs/iomap/buffered-io.c57
-rw-r--r--fs/isofs/inode.c4
-rw-r--r--fs/isofs/rock.c18
-rw-r--r--fs/jbd2/commit.c10
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/recovery.c13
-rw-r--r--fs/jffs2/dir.c35
-rw-r--r--fs/jffs2/file.c4
-rw-r--r--fs/jffs2/fs.c20
-rw-r--r--fs/jffs2/os-linux.h4
-rw-r--r--fs/jffs2/super.c1
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jffs2/xattr.h2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_dinode.h2
-rw-r--r--fs/jfs/jfs_dmap.c23
-rw-r--r--fs/jfs/jfs_imap.c31
-rw-r--r--fs/jfs/jfs_incore.h2
-rw-r--r--fs/jfs/jfs_inode.c4
-rw-r--r--fs/jfs/jfs_logmgr.c33
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_mount.c3
-rw-r--r--fs/jfs/jfs_txnmgr.c4
-rw-r--r--fs/jfs/jfs_xattr.h2
-rw-r--r--fs/jfs/jfs_xtree.c4
-rw-r--r--fs/jfs/jfs_xtree.h37
-rw-r--r--fs/jfs/namei.c20
-rw-r--r--fs/jfs/super.c3
-rw-r--r--fs/jfs/xattr.c2
-rw-r--r--fs/kernfs/file.c78
-rw-r--r--fs/kernfs/inode.c8
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--fs/kernfs/mount.c7
-rw-r--r--fs/libfs.c99
-rw-r--r--fs/lockd/svc.c7
-rw-r--r--fs/lockd/svclock.c43
-rw-r--r--fs/locks.c16
-rw-r--r--fs/mbcache.c22
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/dir.c6
-rw-r--r--fs/minix/inode.c17
-rw-r--r--fs/minix/itree_common.c2
-rw-r--r--fs/mnt_idmapping.c2
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c31
-rw-r--r--fs/namespace.c56
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/blocklayout/blocklayout.h2
-rw-r--r--fs/nfs/blocklayout/dev.c76
-rw-r--r--fs/nfs/callback.c46
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/delegation.c7
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c29
-rw-r--r--fs/nfs/filelayout/filelayout.h2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h2
-rw-r--r--fs/nfs/fscache.h4
-rw-r--r--fs/nfs/inode.c30
-rw-r--r--fs/nfs/nfs.h2
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs42xattr.c87
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4proc.c64
-rw-r--r--fs/nfs/pnfs.c8
-rw-r--r--fs/nfs/pnfs.h5
-rw-r--r--fs/nfs/proc.c3
-rw-r--r--fs/nfs/super.c31
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/nfsd/Makefile3
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/blocklayout.c3
-rw-r--r--fs/nfsd/blocklayoutxdr.c6
-rw-r--r--fs/nfsd/blocklayoutxdr.h4
-rw-r--r--fs/nfsd/cache.h4
-rw-r--r--fs/nfsd/export.c35
-rw-r--r--fs/nfsd/export.h4
-rw-r--r--fs/nfsd/filecache.c50
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c6
-rw-r--r--fs/nfsd/flexfilelayoutxdr.h4
-rw-r--r--fs/nfsd/netlink.c32
-rw-r--r--fs/nfsd/netlink.h22
-rw-r--r--fs/nfsd/netns.h4
-rw-r--r--fs/nfsd/nfs3proc.c9
-rw-r--r--fs/nfsd/nfs4layouts.c6
-rw-r--r--fs/nfsd/nfs4proc.c40
-rw-r--r--fs/nfsd/nfs4state.c63
-rw-r--r--fs/nfsd/nfs4xdr.c2631
-rw-r--r--fs/nfsd/nfscache.c118
-rw-r--r--fs/nfsd/nfsctl.c234
-rw-r--r--fs/nfsd/nfsd.h25
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/nfsfh.h3
-rw-r--r--fs/nfsd/nfssvc.c59
-rw-r--r--fs/nfsd/pnfs.h6
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nfsd/stats.c4
-rw-r--r--fs/nfsd/stats.h18
-rw-r--r--fs/nfsd/trace.h87
-rw-r--r--fs/nfsd/vfs.c70
-rw-r--r--fs/nfsd/vfs.h4
-rw-r--r--fs/nfsd/xdr4.h154
-rw-r--r--fs/nilfs2/dir.c6
-rw-r--r--fs/nilfs2/inode.c20
-rw-r--r--fs/nilfs2/mdt.c66
-rw-r--r--fs/nilfs2/page.c76
-rw-r--r--fs/nilfs2/page.h11
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/sufile.c42
-rw-r--r--fs/nilfs2/the_nilfs.c6
-rw-r--r--fs/notify/dnotify/dnotify.c12
-rw-r--r--fs/notify/fanotify/fanotify.h4
-rw-r--r--fs/notify/fanotify/fanotify_user.c12
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/aops.c255
-rw-r--r--fs/ntfs/file.c89
-rw-r--r--fs/ntfs/inode.c25
-rw-r--r--fs/ntfs/mft.c2
-rw-r--r--fs/ntfs/namei.c1
-rw-r--r--fs/ntfs3/file.c37
-rw-r--r--fs/ntfs3/frecord.c11
-rw-r--r--fs/ntfs3/inode.c25
-rw-r--r--fs/ntfs3/namei.c4
-rw-r--r--fs/ntfs3/ntfs_fs.h2
-rw-r--r--fs/ntfs3/super.c1
-rw-r--r--fs/ntfs3/xattr.c2
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/alloc.c17
-rw-r--r--fs/ocfs2/aops.c25
-rw-r--r--fs/ocfs2/buffer_head_io.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c81
-rw-r--r--fs/ocfs2/dir.c9
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c7
-rw-r--r--fs/ocfs2/dlmglue.c29
-rw-r--r--fs/ocfs2/file.c30
-rw-r--r--fs/ocfs2/inode.c28
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/move_extents.c4
-rw-r--r--fs/ocfs2/namei.c24
-rw-r--r--fs/ocfs2/quota_local.c4
-rw-r--r--fs/ocfs2/refcounttree.c12
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/xattr.c8
-rw-r--r--fs/ocfs2/xattr.h2
-rw-r--r--fs/omfs/inode.c12
-rw-r--r--fs/open.c55
-rw-r--r--fs/openpromfs/inode.c4
-rw-r--r--fs/orangefs/orangefs-kernel.h2
-rw-r--r--fs/orangefs/orangefs-utils.c16
-rw-r--r--fs/orangefs/xattr.c2
-rw-r--r--fs/overlayfs/Makefile2
-rw-r--r--fs/overlayfs/copy_up.c143
-rw-r--r--fs/overlayfs/dir.c64
-rw-r--r--fs/overlayfs/export.c7
-rw-r--r--fs/overlayfs/file.c97
-rw-r--r--fs/overlayfs/inode.c178
-rw-r--r--fs/overlayfs/namei.c52
-rw-r--r--fs/overlayfs/overlayfs.h80
-rw-r--r--fs/overlayfs/params.c333
-rw-r--r--fs/overlayfs/params.h1
-rw-r--r--fs/overlayfs/readdir.c27
-rw-r--r--fs/overlayfs/super.c118
-rw-r--r--fs/overlayfs/util.c123
-rw-r--r--fs/overlayfs/xattrs.c271
-rw-r--r--fs/pipe.c68
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c56
-rw-r--r--fs/proc/bootconfig.c6
-rw-r--r--fs/proc/fd.c11
-rw-r--r--fs/proc/inode.c13
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/proc_sysctl.c10
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/task_mmu.c764
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/proc/thread_self.c2
-rw-r--r--fs/pstore/inode.c5
-rw-r--r--fs/pstore/platform.c9
-rw-r--r--fs/qnx4/inode.c6
-rw-r--r--fs/qnx6/inode.c6
-rw-r--r--fs/quota/dquot.c31
-rw-r--r--fs/ramfs/inode.c7
-rw-r--r--fs/reiserfs/inode.c106
-rw-r--r--fs/reiserfs/journal.c56
-rw-r--r--fs/reiserfs/namei.c8
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h13
-rw-r--r--fs/reiserfs/stree.c5
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c4
-rw-r--r--fs/romfs/super.c5
-rw-r--r--fs/smb/client/cached_dir.c99
-rw-r--r--fs/smb/client/cifs_debug.c45
-rw-r--r--fs/smb/client/cifs_ioctl.h6
-rw-r--r--fs/smb/client/cifs_spnego.c4
-rw-r--r--fs/smb/client/cifsfs.c175
-rw-r--r--fs/smb/client/cifsfs.h6
-rw-r--r--fs/smb/client/cifsglob.h34
-rw-r--r--fs/smb/client/cifspdu.h30
-rw-r--r--fs/smb/client/cifsproto.h28
-rw-r--r--fs/smb/client/cifssmb.c199
-rw-r--r--fs/smb/client/connect.c97
-rw-r--r--fs/smb/client/dfs.c18
-rw-r--r--fs/smb/client/export.c11
-rw-r--r--fs/smb/client/file.c18
-rw-r--r--fs/smb/client/fs_context.h1
-rw-r--r--fs/smb/client/fscache.h6
-rw-r--r--fs/smb/client/inode.c99
-rw-r--r--fs/smb/client/ioctl.c26
-rw-r--r--fs/smb/client/link.c16
-rw-r--r--fs/smb/client/misc.c4
-rw-r--r--fs/smb/client/namespace.c17
-rw-r--r--fs/smb/client/ntlmssp.h4
-rw-r--r--fs/smb/client/readdir.c6
-rw-r--r--fs/smb/client/sess.c298
-rw-r--r--fs/smb/client/smb1ops.c153
-rw-r--r--fs/smb/client/smb2inode.c2
-rw-r--r--fs/smb/client/smb2misc.c58
-rw-r--r--fs/smb/client/smb2ops.c351
-rw-r--r--fs/smb/client/smb2pdu.c262
-rw-r--r--fs/smb/client/smb2pdu.h16
-rw-r--r--fs/smb/client/smb2proto.h12
-rw-r--r--fs/smb/client/smb2transport.c13
-rw-r--r--fs/smb/client/transport.c13
-rw-r--r--fs/smb/client/xattr.c7
-rw-r--r--fs/smb/common/smb2pdu.h44
-rw-r--r--fs/smb/server/connection.c16
-rw-r--r--fs/smb/server/ksmbd_spnego_negtokeninit.asn18
-rw-r--r--fs/smb/server/ksmbd_spnego_negtokentarg.asn17
-rw-r--r--fs/smb/server/ksmbd_work.c51
-rw-r--r--fs/smb/server/mgmt/user_config.h1
-rw-r--r--fs/smb/server/oplock.c118
-rw-r--r--fs/smb/server/oplock.h8
-rw-r--r--fs/smb/server/smb2misc.c15
-rw-r--r--fs/smb/server/smb2ops.c9
-rw-r--r--fs/smb/server/smb2pdu.c164
-rw-r--r--fs/smb/server/smb_common.c11
-rw-r--r--fs/smb/server/smbacl.c36
-rw-r--r--fs/smb/server/smbacl.h2
-rw-r--r--fs/smb/server/transport_rdma.c40
-rw-r--r--fs/smb/server/unicode.c187
-rw-r--r--fs/smb/server/vfs.c105
-rw-r--r--fs/smb/server/vfs.h10
-rw-r--r--fs/smb/server/vfs_cache.c30
-rw-r--r--fs/smb/server/vfs_cache.h9
-rw-r--r--fs/squashfs/block.c2
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/squashfs/inode.c6
-rw-r--r--fs/squashfs/squashfs.h2
-rw-r--r--fs/squashfs/xattr.c2
-rw-r--r--fs/stack.c4
-rw-r--r--fs/stat.c10
-rw-r--r--fs/super.c102
-rw-r--r--fs/sysfs/file.c13
-rw-r--r--fs/sysv/dir.c6
-rw-r--r--fs/sysv/ialloc.c2
-rw-r--r--fs/sysv/inode.c12
-rw-r--r--fs/sysv/itree.c2
-rw-r--r--fs/tracefs/event_inode.c1197
-rw-r--r--fs/tracefs/inode.c27
-rw-r--r--fs/tracefs/internal.h56
-rw-r--r--fs/ubifs/auth.c3
-rw-r--r--fs/ubifs/crypto.c3
-rw-r--r--fs/ubifs/debug.c12
-rw-r--r--fs/ubifs/dir.c27
-rw-r--r--fs/ubifs/file.c19
-rw-r--r--fs/ubifs/journal.c13
-rw-r--r--fs/ubifs/replay.c1
-rw-r--r--fs/ubifs/super.c34
-rw-r--r--fs/ubifs/tnc.c1
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/ecma_167.h2
-rw-r--r--fs/udf/ialloc.c4
-rw-r--r--fs/udf/inode.c38
-rw-r--r--fs/udf/namei.c16
-rw-r--r--fs/udf/udf_sb.h2
-rw-r--r--fs/ufs/balloc.c20
-rw-r--r--fs/ufs/dir.c6
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c67
-rw-r--r--fs/ufs/super.c1
-rw-r--r--fs/ufs/util.c34
-rw-r--r--fs/ufs/util.h10
-rw-r--r--fs/userfaultfd.c98
-rw-r--r--fs/vboxsf/utils.c15
-rw-r--r--fs/xattr.c6
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c27
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c45
-rw-r--r--fs/xfs/libxfs/xfs_defer.c28
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_format.h34
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c13
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c809
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h383
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.h2
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c10
-rw-r--r--fs/xfs/libxfs/xfs_types.c4
-rw-r--r--fs/xfs/libxfs/xfs_types.h10
-rw-r--r--fs/xfs/scrub/bmap.c2
-rw-r--r--fs/xfs/scrub/fscounters.c2
-rw-r--r--fs/xfs/scrub/inode.c3
-rw-r--r--fs/xfs/scrub/rtbitmap.c28
-rw-r--r--fs/xfs/scrub/rtsummary.c72
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/scrub/trace.h15
-rw-r--r--fs/xfs/xfs_bmap_util.c81
-rw-r--r--fs/xfs/xfs_buf.c46
-rw-r--r--fs/xfs/xfs_buf.h5
-rw-r--r--fs/xfs/xfs_dquot.c5
-rw-r--r--fs/xfs/xfs_dquot_item_recover.c21
-rw-r--r--fs/xfs/xfs_file.c63
-rw-r--r--fs/xfs/xfs_fsmap.c15
-rw-r--r--fs/xfs/xfs_icache.c26
-rw-r--r--fs/xfs/xfs_inode.c28
-rw-r--r--fs/xfs/xfs_inode.h17
-rw-r--r--fs/xfs/xfs_inode_item.c7
-rw-r--r--fs/xfs/xfs_inode_item_recover.c46
-rw-r--r--fs/xfs/xfs_ioctl.c35
-rw-r--r--fs/xfs/xfs_ioctl32.h2
-rw-r--r--fs/xfs/xfs_iops.c15
-rw-r--r--fs/xfs/xfs_itable.c12
-rw-r--r--fs/xfs/xfs_linux.h12
-rw-r--r--fs/xfs/xfs_log.c23
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_ondisk.h4
-rw-r--r--fs/xfs/xfs_qm.c27
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_reflink.c5
-rw-r--r--fs/xfs/xfs_rtalloc.c650
-rw-r--r--fs/xfs/xfs_rtalloc.h94
-rw-r--r--fs/xfs/xfs_super.c45
-rw-r--r--fs/xfs/xfs_trans.c7
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--fs/xfs/xfs_xattr.h2
-rw-r--r--fs/zonefs/super.c10
864 files changed, 118289 insertions, 16327 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d525957594b6..61dbe52bb3a3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -732,4 +732,5 @@ module_exit(exit_v9fs)
MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_DESCRIPTION("9P Client File System");
MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index cdf441f22e07..731e3d14b67d 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -52,7 +52,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
unsigned int flags);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
-void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
int v9fs_uflags2omode(int uflags, int extended);
void v9fs_blank_wstat(struct p9_wstat *wstat);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0d28ecf668d0..b845ee18a80b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->a_ops = &v9fs_addr_operations;
inode->i_private = NULL;
@@ -1150,8 +1150,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
set_nlink(inode, 1);
- inode->i_atime.tv_sec = stat->atime;
- inode->i_mtime.tv_sec = stat->mtime;
+ inode_set_atime(inode, stat->atime, 0);
+ inode_set_mtime(inode, stat->mtime, 0);
inode_set_ctime(inode, stat->mtime, 0);
inode->i_uid = v9ses->dfltuid;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1312f68965ac..c7319af2f471 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -641,10 +641,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
struct v9fs_inode *v9inode = V9FS_I(inode);
if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
- inode->i_atime.tv_sec = stat->st_atime_sec;
- inode->i_atime.tv_nsec = stat->st_atime_nsec;
- inode->i_mtime.tv_sec = stat->st_mtime_sec;
- inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+ inode_set_atime(inode, stat->st_atime_sec,
+ stat->st_atime_nsec);
+ inode_set_mtime(inode, stat->st_mtime_sec,
+ stat->st_mtime_nsec);
inode_set_ctime(inode, stat->st_ctime_sec,
stat->st_ctime_nsec);
inode->i_uid = stat->st_uid;
@@ -660,12 +660,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_blocks = stat->st_blocks;
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
- inode->i_atime.tv_sec = stat->st_atime_sec;
- inode->i_atime.tv_nsec = stat->st_atime_nsec;
+ inode_set_atime(inode, stat->st_atime_sec,
+ stat->st_atime_nsec);
}
if (stat->st_result_mask & P9_STATS_MTIME) {
- inode->i_mtime.tv_sec = stat->st_mtime_sec;
- inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+ inode_set_mtime(inode, stat->st_mtime_sec,
+ stat->st_mtime_nsec);
}
if (stat->st_result_mask & P9_STATS_CTIME) {
inode_set_ctime(inode, stat->st_ctime_sec,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index e00cf8109b3f..8604e3377ee7 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -68,7 +68,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
struct p9_fid *fid;
int ret;
- p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
+ p9_debug(P9_DEBUG_VFS, "name = '%s' value_len = %zu\n",
name, buffer_size);
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
@@ -139,7 +139,8 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
+ /* Txattrwalk with an empty string lists xattrs instead */
+ return v9fs_xattr_get(dentry, "", buffer, buffer_size);
}
static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
@@ -162,27 +163,27 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
return v9fs_xattr_set(dentry, full_name, value, size, flags);
}
-static struct xattr_handler v9fs_xattr_user_handler = {
+static const struct xattr_handler v9fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
-static struct xattr_handler v9fs_xattr_trusted_handler = {
+static const struct xattr_handler v9fs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
#ifdef CONFIG_9P_FS_SECURITY
-static struct xattr_handler v9fs_xattr_security_handler = {
+static const struct xattr_handler v9fs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
#endif
-const struct xattr_handler *v9fs_xattr_handlers[] = {
+const struct xattr_handler * const v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
#ifdef CONFIG_9P_FS_SECURITY
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index b5636e544c8a..3ad5a802352a 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -10,7 +10,7 @@
#include <net/9p/9p.h>
#include <net/9p/client.h>
-extern const struct xattr_handler *v9fs_xattr_handlers[];
+extern const struct xattr_handler * const v9fs_xattr_handlers[];
ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
void *buffer, size_t buffer_size);
diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941..42837617a55b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
+source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"
endif # BLOCK
@@ -255,7 +256,7 @@ config ARCH_SUPPORTS_HUGETLBFS
config HUGETLBFS
bool "HugeTLB file system support"
- depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
select MEMFD_CREATE
help
@@ -267,6 +268,7 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
+ select XARRAY_MULTI
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
diff --git a/fs/Makefile b/fs/Makefile
index f9541f40be4e..75522f88e763 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
+obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 20963002578a..3081edb09e46 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -242,6 +242,7 @@ struct inode *
adfs_iget(struct super_block *sb, struct object_info *obj)
{
struct inode *inode;
+ struct timespec64 ts;
inode = new_inode(sb);
if (!inode)
@@ -268,9 +269,10 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
ADFS_I(inode)->attr = obj->attr;
inode->i_mode = adfs_atts2mode(sb, inode);
- adfs_adfs2unix_time(&inode->i_mtime, inode);
- inode->i_atime = inode->i_mtime;
- inode_set_ctime_to_ts(inode, inode->i_mtime);
+ adfs_adfs2unix_time(&ts, inode);
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_ctime_to_ts(inode, ts);
if (S_ISDIR(inode->i_mode)) {
inode->i_op = &adfs_dir_inode_operations;
@@ -321,7 +323,8 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) {
adfs_unix2adfs_time(inode, &attr->ia_mtime);
- adfs_adfs2unix_time(&inode->i_mtime, inode);
+ adfs_adfs2unix_time(&attr->ia_mtime, inode);
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
}
/*
@@ -329,7 +332,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
* have the ability to represent them in our filesystem?
*/
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7ba93efc1143..fd669daa4e7b 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
mark_buffer_dirty_inode(dir_bh, dir);
affs_brelse(dir_bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
mark_inode_dirty(dir);
@@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
affs_brelse(bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
mark_inode_dirty(dir);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 060746c63151..0210df8d3500 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -149,13 +149,9 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
break;
}
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec =
- inode_set_ctime(inode,
- (be32_to_cpu(tail->change.days) * 86400LL +
- be32_to_cpu(tail->change.mins) * 60 +
- be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA)
- + sys_tz.tz_minuteswest * 60, 0).tv_sec;
- inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ inode_set_atime(inode, inode_set_ctime(inode, (be32_to_cpu(tail->change.days) * 86400LL + be32_to_cpu(tail->change.mins) * 60 + be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) + sys_tz.tz_minuteswest * 60, 0).tv_sec, 0).tv_sec,
+ 0);
affs_brelse(bh);
unlock_new_inode(inode);
return inode;
@@ -187,12 +183,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
}
tail = AFFS_TAIL(sb, bh);
if (tail->stype == cpu_to_be32(ST_ROOT)) {
- affs_secs_to_datestamp(inode->i_mtime.tv_sec,
+ affs_secs_to_datestamp(inode_get_mtime_sec(inode),
&AFFS_ROOT_TAIL(sb, bh)->root_change);
} else {
tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect);
tail->size = cpu_to_be32(inode->i_size);
- affs_secs_to_datestamp(inode->i_mtime.tv_sec, &tail->change);
+ affs_secs_to_datestamp(inode_get_mtime_sec(inode),
+ &tail->change);
if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
uid = i_uid_read(inode);
gid = i_gid_read(inode);
@@ -314,7 +311,7 @@ affs_new_inode(struct inode *dir)
inode->i_gid = current_fsgid();
inode->i_ino = block;
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
atomic_set(&AFFS_I(inode)->i_opencnt, 0);
AFFS_I(inode)->i_blkcnt = 0;
AFFS_I(inode)->i_lc = NULL;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 2fe4a5832fcf..d6b9758ee23d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -568,6 +568,7 @@ static struct dentry *affs_fh_to_parent(struct super_block *sb, struct fid *fid,
}
const struct export_operations affs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = affs_fh_to_dentry,
.fh_to_parent = affs_fh_to_parent,
.get_parent = affs_get_parent,
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 988c2ac7cece..926cb1188eba 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -409,10 +409,12 @@ static int afs_update_cell(struct afs_cell *cell)
if (ret == -ENOMEM)
goto out_wake;
- ret = -ENOMEM;
vllist = afs_alloc_vlserver_list(0);
- if (!vllist)
+ if (!vllist) {
+ if (ret >= 0)
+ ret = -ENOMEM;
goto out_wake;
+ }
switch (ret) {
case -ENODATA:
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 95bcbd7654d1..1f656005018e 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
set_nlink(inode, 2);
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
inode->i_generation = 0;
@@ -114,6 +114,7 @@ static int afs_probe_cell_name(struct dentry *dentry)
struct afs_net *net = afs_d2net(dentry);
const char *name = dentry->d_name.name;
size_t len = dentry->d_name.len;
+ char *result = NULL;
int ret;
/* Names prefixed with a dot are R/W mounts. */
@@ -131,9 +132,22 @@ static int afs_probe_cell_name(struct dentry *dentry)
}
ret = dns_query(net->net, "afsdb", name, len, "srv=1",
- NULL, NULL, false);
- if (ret == -ENODATA)
- ret = -EDESTADDRREQ;
+ &result, NULL, false);
+ if (ret == -ENODATA || ret == -ENOKEY || ret == 0)
+ ret = -ENOENT;
+ if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) {
+ struct dns_server_list_v1_header *v1 = (void *)result;
+
+ if (v1->hdr.zero == 0 &&
+ v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST &&
+ v1->hdr.version == 1 &&
+ (v1->status != DNS_LOOKUP_GOOD &&
+ v1->status != DNS_LOOKUP_GOOD_WITH_BAD))
+ return -ENOENT;
+
+ }
+
+ kfree(result);
return ret;
}
@@ -252,20 +266,9 @@ static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
return 1;
}
-/*
- * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
- * sleep)
- * - called from dput() when d_count is going to 0.
- * - return 1 to request dentry be unhashed, 0 otherwise
- */
-static int afs_dynroot_d_delete(const struct dentry *dentry)
-{
- return d_really_is_positive(dentry);
-}
-
const struct dentry_operations afs_dynroot_dentry_operations = {
.d_revalidate = afs_dynroot_d_revalidate,
- .d_delete = afs_dynroot_d_delete,
+ .d_delete = always_delete_dentry,
.d_release = afs_d_release,
.d_automount = afs_d_automount,
};
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1c794a1896aa..78efc9719349 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -91,8 +91,8 @@ static int afs_inode_init_from_status(struct afs_operation *op,
t = status->mtime_client;
inode_set_ctime_to_ts(inode, t);
- inode->i_mtime = t;
- inode->i_atime = t;
+ inode_set_mtime_to_ts(inode, t);
+ inode_set_atime_to_ts(inode, t);
inode->i_flags |= S_NOATIME;
inode->i_uid = make_kuid(&init_user_ns, status->owner);
inode->i_gid = make_kgid(&init_user_ns, status->group);
@@ -204,7 +204,7 @@ static void afs_apply_status(struct afs_operation *op,
}
t = status->mtime_client;
- inode->i_mtime = t;
+ inode_set_mtime_to_ts(inode, t);
if (vp->update_ctime)
inode_set_ctime_to_ts(inode, op->ctime);
@@ -253,7 +253,7 @@ static void afs_apply_status(struct afs_operation *op,
if (change_size) {
afs_set_i_size(vnode, status->size);
inode_set_ctime_to_ts(inode, t);
- inode->i_atime = t;
+ inode_set_atime_to_ts(inode, t);
}
}
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index da73b97e19a9..7385d62c8cf5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -87,7 +87,7 @@ struct afs_addr_list {
enum dns_lookup_status status:8;
unsigned long failed; /* Mask of addrs that failed locally/ICMP */
unsigned long responded; /* Mask of addrs that responded */
- struct sockaddr_rxrpc addrs[];
+ struct sockaddr_rxrpc addrs[] __counted_by(max_addrs);
#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};
@@ -553,6 +553,7 @@ struct afs_server_entry {
};
struct afs_server_list {
+ struct rcu_head rcu;
afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */
refcount_t usage;
unsigned char nr_servers;
@@ -585,6 +586,7 @@ struct afs_volume {
#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */
#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */
#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */
+#define AFS_VOLUME_RM_TREE 7 /* - Set if volume removed from cell->volumes */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_volume *cache; /* Caching cookie */
#endif
@@ -705,7 +707,7 @@ struct afs_permits {
refcount_t usage;
unsigned short nr_permits; /* Number of records */
bool invalidated; /* Invalidated due to key change */
- struct afs_permit permits[]; /* List of permits sorted by key pointer */
+ struct afs_permit permits[] __counted_by(nr_permits); /* List of permits sorted by key pointer */
};
/*
@@ -1512,6 +1514,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *,
extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
extern int afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
+bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason);
extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace);
extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace);
extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
@@ -1541,7 +1544,7 @@ int afs_launder_folio(struct folio *);
/*
* xattr.c
*/
-extern const struct xattr_handler *afs_xattr_handlers[];
+extern const struct xattr_handler * const afs_xattr_handlers[];
/*
* yfsclient.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index eae288c8d40a..6425c81d07de 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -41,8 +41,6 @@ const char afs_init_sysname[] = "arm_linux26";
const char afs_init_sysname[] = "aarch64_linux26";
#elif defined(CONFIG_X86_32)
const char afs_init_sysname[] = "i386_linux26";
-#elif defined(CONFIG_IA64)
-const char afs_init_sysname[] = "ia64_linux26";
#elif defined(CONFIG_PPC64)
const char afs_init_sysname[] = "ppc64_linux26";
#elif defined(CONFIG_PPC32)
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index ed1644e7683f..d642d06a453b 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -424,7 +424,7 @@ error_kill_call:
if (call->async) {
if (cancel_work_sync(&call->async_work))
afs_put_call(call);
- afs_put_call(call);
+ afs_set_call_complete(call, ret, 0);
}
ac->error = ret;
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index ed9056703505..b59896b1de0a 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -17,7 +17,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
for (i = 0; i < slist->nr_servers; i++)
afs_unuse_server(net, slist->servers[i].server,
afs_server_trace_put_slist);
- kfree(slist);
+ kfree_rcu(slist, rcu);
}
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 95d713074dc8..a01a0fb2cdbb 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -407,6 +407,10 @@ static int afs_validate_fc(struct fs_context *fc)
return PTR_ERR(volume);
ctx->volume = volume;
+ if (volume->type != AFSVL_RWVOL) {
+ ctx->flock_mode = afs_flock_mode_local;
+ fc->sb_flags |= SB_RDONLY;
+ }
}
return 0;
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index 488e58490b16..eb415ce56360 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -58,6 +58,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
}
/* Status load is ordered after lookup counter load */
+ if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
+ pr_warn("No record of cell %s\n", cell->name);
+ vc->error = -ENOENT;
+ return false;
+ }
+
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
vc->error = -EDESTADDRREQ;
return false;
@@ -285,6 +291,7 @@ failed:
*/
static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
{
+ struct afs_cell *cell = vc->cell;
static int count;
int i;
@@ -294,6 +301,9 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
rcu_read_lock();
pr_notice("EDESTADDR occurred\n");
+ pr_notice("CELL: %s err=%d\n", cell->name, cell->error);
+ pr_notice("DNS: src=%u st=%u lc=%x\n",
+ cell->dns_source, cell->dns_status, cell->dns_lookup_count);
pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 29d483c80281..115c081a8e2c 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -32,8 +32,13 @@ static struct afs_volume *afs_insert_volume_into_cell(struct afs_cell *cell,
} else if (p->vid > volume->vid) {
pp = &(*pp)->rb_right;
} else {
- volume = afs_get_volume(p, afs_volume_trace_get_cell_insert);
- goto found;
+ if (afs_try_get_volume(p, afs_volume_trace_get_cell_insert)) {
+ volume = p;
+ goto found;
+ }
+
+ set_bit(AFS_VOLUME_RM_TREE, &volume->flags);
+ rb_replace_node_rcu(&p->cell_node, &volume->cell_node, &cell->volumes);
}
}
@@ -56,7 +61,8 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
afs_volume_trace_remove);
write_seqlock(&cell->volume_lock);
hlist_del_rcu(&volume->proc_link);
- rb_erase(&volume->cell_node, &cell->volumes);
+ if (!test_and_set_bit(AFS_VOLUME_RM_TREE, &volume->flags))
+ rb_erase(&volume->cell_node, &cell->volumes);
write_sequnlock(&cell->volume_lock);
}
}
@@ -232,6 +238,20 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
}
/*
+ * Try to get a reference on a volume record.
+ */
+bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason)
+{
+ int r;
+
+ if (__refcount_inc_not_zero(&volume->ref, &r)) {
+ trace_afs_volume(volume->vid, r + 1, reason);
+ return true;
+ }
+ return false;
+}
+
+/*
* Get a reference on a volume record.
*/
struct afs_volume *afs_get_volume(struct afs_volume *volume,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index e1c45341719b..4a168781936b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -424,7 +424,7 @@ try_next_key:
op->store.write_iter = iter;
op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
- op->mtime = vnode->netfs.inode.i_mtime;
+ op->mtime = inode_get_mtime(&vnode->netfs.inode);
afs_wait_for_operation(op);
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 9048d8ccc715..64b2c0224f62 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -353,7 +353,7 @@ static const struct xattr_handler afs_xattr_afs_volume_handler = {
.get = afs_xattr_get_volume,
};
-const struct xattr_handler *afs_xattr_handlers[] = {
+const struct xattr_handler * const afs_xattr_handlers[] = {
&afs_xattr_afs_acl_handler,
&afs_xattr_afs_cell_handler,
&afs_xattr_afs_fid_handler,
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 24192a7667ed..d26222b7eefe 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,8 +24,8 @@
#include <linux/uaccess.h>
-static struct vfsmount *anon_inode_mnt __read_mostly;
-static struct inode *anon_inode_inode;
+static struct vfsmount *anon_inode_mnt __ro_after_init;
+static struct inode *anon_inode_inode __ro_after_init;
/*
* anon_inodefs_dname() is called from d_path().
diff --git a/fs/attr.c b/fs/attr.c
index a8ae5f6d9b16..bdf5deb06ea9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -308,9 +308,9 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index d5a44fa88acf..8c1d587b3eef 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -25,6 +25,8 @@
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/magic.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
@@ -205,20 +207,34 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry)
/* Initializing function */
-int autofs_fill_super(struct super_block *, void *, int);
+extern const struct fs_parameter_spec autofs_param_specs[];
+int autofs_init_fs_context(struct fs_context *fc);
struct autofs_info *autofs_new_ino(struct autofs_sb_info *);
void autofs_clean_ino(struct autofs_info *);
-static inline int autofs_prepare_pipe(struct file *pipe)
+static inline int autofs_check_pipe(struct file *pipe)
{
if (!(pipe->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (!S_ISFIFO(file_inode(pipe)->i_mode))
return -EINVAL;
+ return 0;
+}
+
+static inline void autofs_set_packet_pipe_flags(struct file *pipe)
+{
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
/* We don't expect -EAGAIN */
pipe->f_flags &= ~O_NONBLOCK;
+}
+
+static inline int autofs_prepare_pipe(struct file *pipe)
+{
+ int ret = autofs_check_pipe(pipe);
+ if (ret < 0)
+ return ret;
+ autofs_set_packet_pipe_flags(pipe);
return 0;
}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index d3f55e874338..b5e4dfa04ed0 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -7,16 +7,11 @@
#include <linux/init.h>
#include "autofs_i.h"
-static struct dentry *autofs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_nodev(fs_type, flags, data, autofs_fill_super);
-}
-
struct file_system_type autofs_fs_type = {
.owner = THIS_MODULE,
.name = "autofs",
- .mount = autofs_mount,
+ .init_fs_context = autofs_init_fs_context,
+ .parameters = autofs_param_specs,
.kill_sb = autofs_kill_sb,
};
MODULE_ALIAS_FS("autofs");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 2b49662ed237..1f5db6863663 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -6,7 +6,6 @@
#include <linux/seq_file.h>
#include <linux/pagemap.h>
-#include <linux/parser.h>
#include "autofs_i.h"
@@ -110,189 +109,179 @@ static const struct super_operations autofs_sops = {
.evict_inode = autofs_evict_inode,
};
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
- Opt_ignore};
-
-static const match_table_t tokens = {
- {Opt_fd, "fd=%u"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_pgrp, "pgrp=%u"},
- {Opt_minproto, "minproto=%u"},
- {Opt_maxproto, "maxproto=%u"},
- {Opt_indirect, "indirect"},
- {Opt_direct, "direct"},
- {Opt_offset, "offset"},
- {Opt_strictexpire, "strictexpire"},
- {Opt_ignore, "ignore"},
- {Opt_err, NULL}
+enum {
+ Opt_direct,
+ Opt_fd,
+ Opt_gid,
+ Opt_ignore,
+ Opt_indirect,
+ Opt_maxproto,
+ Opt_minproto,
+ Opt_offset,
+ Opt_pgrp,
+ Opt_strictexpire,
+ Opt_uid,
};
-static int parse_options(char *options,
- struct inode *root, int *pgrp, bool *pgrp_set,
- struct autofs_sb_info *sbi)
+const struct fs_parameter_spec autofs_param_specs[] = {
+ fsparam_flag ("direct", Opt_direct),
+ fsparam_fd ("fd", Opt_fd),
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_flag ("ignore", Opt_ignore),
+ fsparam_flag ("indirect", Opt_indirect),
+ fsparam_u32 ("maxproto", Opt_maxproto),
+ fsparam_u32 ("minproto", Opt_minproto),
+ fsparam_flag ("offset", Opt_offset),
+ fsparam_u32 ("pgrp", Opt_pgrp),
+ fsparam_flag ("strictexpire", Opt_strictexpire),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
+};
+
+struct autofs_fs_context {
+ kuid_t uid;
+ kgid_t gid;
+ int pgrp;
+ bool pgrp_set;
+};
+
+/*
+ * Open the fd. We do it here rather than in get_tree so that it's done in the
+ * context of the system call that passed the data and not the one that
+ * triggered the superblock creation, lest the fd gets reassigned.
+ */
+static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- int pipefd = -1;
- kuid_t uid;
- kgid_t gid;
+ struct file *pipe;
+ int ret;
- root->i_uid = current_uid();
- root->i_gid = current_gid();
+ if (param->type == fs_value_is_file) {
+ /* came through the new api */
+ pipe = param->file;
+ param->file = NULL;
+ } else {
+ pipe = fget(result->uint_32);
+ }
+ if (!pipe) {
+ errorf(fc, "could not open pipe file descriptor");
+ return -EBADF;
+ }
- sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
- sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
+ ret = autofs_check_pipe(pipe);
+ if (ret < 0) {
+ errorf(fc, "Invalid/unusable pipe");
+ if (param->type != fs_value_is_file)
+ fput(pipe);
+ return -EBADF;
+ }
- sbi->pipefd = -1;
+ autofs_set_packet_pipe_flags(pipe);
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_fd:
- if (match_int(args, &pipefd))
- return 1;
- sbi->pipefd = pipefd;
- break;
- case Opt_uid:
- if (match_int(args, &option))
- return 1;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid))
- return 1;
- root->i_uid = uid;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return 1;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid))
- return 1;
- root->i_gid = gid;
- break;
- case Opt_pgrp:
- if (match_int(args, &option))
- return 1;
- *pgrp = option;
- *pgrp_set = true;
- break;
- case Opt_minproto:
- if (match_int(args, &option))
- return 1;
- sbi->min_proto = option;
- break;
- case Opt_maxproto:
- if (match_int(args, &option))
- return 1;
- sbi->max_proto = option;
- break;
- case Opt_indirect:
- set_autofs_type_indirect(&sbi->type);
- break;
- case Opt_direct:
- set_autofs_type_direct(&sbi->type);
- break;
- case Opt_offset:
- set_autofs_type_offset(&sbi->type);
- break;
- case Opt_strictexpire:
- sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
- break;
- case Opt_ignore:
- sbi->flags |= AUTOFS_SBI_IGNORE;
- break;
- default:
- return 1;
- }
+ if (sbi->pipe)
+ fput(sbi->pipe);
+
+ sbi->pipefd = result->uint_32;
+ sbi->pipe = pipe;
+
+ return 0;
+}
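
For reference, autofs_parse_fd() above accepts the control pipe either as a number (the legacy "fd=%u" option string) or as a file passed directly over the new mount API. A rough, hypothetical userspace sketch of the latter path, assuming the fsopen()/fsconfig()/fsmount() syscalls are reached via syscall(2); the mount target, protocol values and error handling are placeholders, not part of this patch:

	#include <fcntl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/mount.h>

	static int mount_autofs(int pipefd, const char *target)
	{
		long fsfd, mfd;

		fsfd = syscall(SYS_fsopen, "autofs", FSOPEN_CLOEXEC);
		if (fsfd < 0)
			return -1;

		/* FSCONFIG_SET_FD hands the pipe over as a file, which is the
		 * fs_value_is_file branch in autofs_parse_fd() above. */
		if (syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FD, "fd", NULL, pipefd) ||
		    syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "minproto", "5", 0) ||
		    syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "maxproto", "5", 0) ||
		    syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "direct", NULL, 0) ||
		    syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0))
			return -1;

		mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
		if (mfd < 0)
			return -1;

		return syscall(SYS_move_mount, mfd, "", AT_FDCWD, target,
			       MOVE_MOUNT_F_EMPTY_PATH);
	}
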
+
+static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+ struct fs_parse_result result;
+ kuid_t uid;
+ kgid_t gid;
+ int opt;
+
+ opt = fs_parse(fc, autofs_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_fd:
+ return autofs_parse_fd(fc, sbi, param, &result);
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return invalfc(fc, "Invalid uid");
+ ctx->uid = uid;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return invalfc(fc, "Invalid gid");
+ ctx->gid = gid;
+ break;
+ case Opt_pgrp:
+ ctx->pgrp = result.uint_32;
+ ctx->pgrp_set = true;
+ break;
+ case Opt_minproto:
+ sbi->min_proto = result.uint_32;
+ break;
+ case Opt_maxproto:
+ sbi->max_proto = result.uint_32;
+ break;
+ case Opt_indirect:
+ set_autofs_type_indirect(&sbi->type);
+ break;
+ case Opt_direct:
+ set_autofs_type_direct(&sbi->type);
+ break;
+ case Opt_offset:
+ set_autofs_type_offset(&sbi->type);
+ break;
+ case Opt_strictexpire:
+ sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
+ break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
}
- return (sbi->pipefd < 0);
+
+ return 0;
}
-int autofs_fill_super(struct super_block *s, void *data, int silent)
+static struct autofs_sb_info *autofs_alloc_sbi(void)
{
- struct inode *root_inode;
- struct dentry *root;
- struct file *pipe;
struct autofs_sb_info *sbi;
- struct autofs_info *ino;
- int pgrp = 0;
- bool pgrp_set = false;
- int ret = -EINVAL;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- return -ENOMEM;
- pr_debug("starting up, sbi = %p\n", sbi);
+ return NULL;
- s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
- sbi->pipefd = -1;
- sbi->pipe = NULL;
- sbi->exp_timeout = 0;
- sbi->oz_pgrp = NULL;
- sbi->sb = s;
- sbi->version = 0;
- sbi->sub_version = 0;
sbi->flags = AUTOFS_SBI_CATATONIC;
+ sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
+ sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
+ sbi->pipefd = -1;
+
set_autofs_type_indirect(&sbi->type);
- sbi->min_proto = 0;
- sbi->max_proto = 0;
mutex_init(&sbi->wq_mutex);
mutex_init(&sbi->pipe_mutex);
spin_lock_init(&sbi->fs_lock);
- sbi->queues = NULL;
spin_lock_init(&sbi->lookup_lock);
INIT_LIST_HEAD(&sbi->active_list);
INIT_LIST_HEAD(&sbi->expiring_list);
- s->s_blocksize = 1024;
- s->s_blocksize_bits = 10;
- s->s_magic = AUTOFS_SUPER_MAGIC;
- s->s_op = &autofs_sops;
- s->s_d_op = &autofs_dentry_operations;
- s->s_time_gran = 1;
-
- /*
- * Get the root inode and dentry, but defer checking for errors.
- */
- ino = autofs_new_ino(sbi);
- if (!ino) {
- ret = -ENOMEM;
- goto fail_free;
- }
- root_inode = autofs_get_inode(s, S_IFDIR | 0755);
- root = d_make_root(root_inode);
- if (!root) {
- ret = -ENOMEM;
- goto fail_ino;
- }
- pipe = NULL;
- root->d_fsdata = ino;
+ return sbi;
+}
- /* Can this call block? */
- if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) {
- pr_err("called with bogus options\n");
- goto fail_dput;
- }
+static int autofs_validate_protocol(struct fs_context *fc)
+{
+ struct autofs_sb_info *sbi = fc->s_fs_info;
/* Test versions first */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- pr_err("kernel does not match daemon version "
+ errorf(fc, "kernel does not match daemon version "
"daemon (%d, %d) kernel (%d, %d)\n",
sbi->min_proto, sbi->max_proto,
AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
- goto fail_dput;
+ return -EINVAL;
}
/* Establish highest kernel protocol version */
@@ -300,62 +289,148 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
sbi->version = AUTOFS_MAX_PROTO_VERSION;
else
sbi->version = sbi->max_proto;
- sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
-
- if (pgrp_set) {
- sbi->oz_pgrp = find_get_pid(pgrp);
- if (!sbi->oz_pgrp) {
- pr_err("could not find process group %d\n",
- pgrp);
- goto fail_dput;
- }
- } else {
- sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+
+ switch (sbi->version) {
+ case 4:
+ sbi->sub_version = 7;
+ break;
+ case 5:
+ sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
+ break;
+ default:
+ sbi->sub_version = 0;
}
- if (autofs_type_trigger(sbi->type))
- __managed_dentry_set_managed(root);
+ return 0;
+}
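
The negotiation above clamps the daemon's advertised protocol range against the kernel's supported range and then picks the highest mutually supported version. A minimal standalone sketch; the kernel range of 3..5 is assumed purely for illustration, standing in for AUTOFS_MIN_PROTO_VERSION/AUTOFS_MAX_PROTO_VERSION:

	#include <assert.h>

	int main(void)
	{
		int kmin = 3, kmax = 5;	/* stand-ins for the kernel's supported range */
		int dmin = 4, dmax = 7;	/* daemon's advertised range */
		int version;

		assert(!(dmax < kmin || dmin > kmax));	/* ranges must overlap */
		version = dmax > kmax ? kmax : dmax;	/* highest common version */
		assert(version == 5);
		return 0;
	}
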
+static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = s->s_fs_info;
+ struct inode *root_inode;
+ struct autofs_info *ino;
+
+ pr_debug("starting up, sbi = %p\n", sbi);
+
+ sbi->sb = s;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = AUTOFS_SUPER_MAGIC;
+ s->s_op = &autofs_sops;
+ s->s_d_op = &autofs_dentry_operations;
+ s->s_time_gran = 1;
+
+ /*
+ * Get the root inode and dentry, but defer checking for errors.
+ */
+ ino = autofs_new_ino(sbi);
+ if (!ino)
+ return -ENOMEM;
+
+ root_inode = autofs_get_inode(s, S_IFDIR | 0755);
+ if (!root_inode)
+ return -ENOMEM;
+
+ root_inode->i_uid = ctx->uid;
+ root_inode->i_gid = ctx->gid;
root_inode->i_fop = &autofs_root_operations;
root_inode->i_op = &autofs_dir_inode_operations;
+ s->s_root = d_make_root(root_inode);
+ if (unlikely(!s->s_root)) {
+ autofs_free_ino(ino);
+ return -ENOMEM;
+ }
+ s->s_root->d_fsdata = ino;
+
+ if (ctx->pgrp_set) {
+ sbi->oz_pgrp = find_get_pid(ctx->pgrp);
+ if (!sbi->oz_pgrp)
+ return invalf(fc, "Could not find process group %d",
+ ctx->pgrp);
+ } else
+ sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+
+ if (autofs_type_trigger(sbi->type))
+ /* s->s_root won't be contended so there's little to
+ * be gained by not taking the d_lock when setting
+		 * d_flags, even when a lot of mounts are being done.
+ */
+ managed_dentry_set_managed(s->s_root);
+
pr_debug("pipe fd = %d, pgrp = %u\n",
sbi->pipefd, pid_nr(sbi->oz_pgrp));
- pipe = fget(sbi->pipefd);
- if (!pipe) {
- pr_err("could not open pipe file descriptor\n");
- goto fail_put_pid;
- }
- ret = autofs_prepare_pipe(pipe);
- if (ret < 0)
- goto fail_fput;
- sbi->pipe = pipe;
sbi->flags &= ~AUTOFS_SBI_CATATONIC;
+ return 0;
+}
- /*
- * Success! Install the root dentry now to indicate completion.
- */
- s->s_root = root;
+/*
+ * Validate the parameters and then request a superblock.
+ */
+static int autofs_get_tree(struct fs_context *fc)
+{
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+ int ret;
+
+ ret = autofs_validate_protocol(fc);
+ if (ret)
+ return ret;
+
+ if (sbi->pipefd < 0)
+ return invalf(fc, "No control pipe specified");
+
+ return get_tree_nodev(fc, autofs_fill_super);
+}
+
+static void autofs_free_fc(struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi) {
+ if (sbi->pipe)
+ fput(sbi->pipe);
+ kfree(sbi);
+ }
+ kfree(ctx);
+}
+
+static const struct fs_context_operations autofs_context_ops = {
+ .free = autofs_free_fc,
+ .parse_param = autofs_parse_param,
+ .get_tree = autofs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+int autofs_init_fs_context(struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx;
+ struct autofs_sb_info *sbi;
+
+ ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ goto nomem;
+
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+
+ sbi = autofs_alloc_sbi();
+ if (!sbi)
+ goto nomem_ctx;
+
+ fc->fs_private = ctx;
+ fc->s_fs_info = sbi;
+ fc->ops = &autofs_context_ops;
return 0;
- /*
- * Failure ... clean up.
- */
-fail_fput:
- pr_err("pipe file descriptor does not contain proper ops\n");
- fput(pipe);
-fail_put_pid:
- put_pid(sbi->oz_pgrp);
-fail_dput:
- dput(root);
- goto fail_free;
-fail_ino:
- autofs_free_ino(ino);
-fail_free:
- kfree(sbi);
- s->s_fs_info = NULL;
- return ret;
+nomem_ctx:
+ kfree(ctx);
+nomem:
+ return -ENOMEM;
}
struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
@@ -370,7 +445,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
inode->i_uid = d_inode(sb->s_root)->i_uid;
inode->i_gid = d_inode(sb->s_root)->i_gid;
}
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_ino = get_next_ino();
if (S_ISDIR(mode)) {
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 512b9a26c63d..530d18827e35 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return 0;
}
@@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
spin_lock(&sbi->lookup_lock);
__autofs_add_expiring(dentry);
@@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return 0;
}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 83f9566c973b..316d88da2ce1 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -208,7 +208,7 @@ void make_bad_inode(struct inode *inode)
remove_inode_hash(inode);
inode->i_mode = S_IFREG;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &bad_inode_ops;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &bad_file_ops;
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
new file mode 100644
index 000000000000..fddc7be58022
--- /dev/null
+++ b/fs/bcachefs/Kconfig
@@ -0,0 +1,95 @@
+
+config BCACHEFS_FS
+ tristate "bcachefs filesystem support (EXPERIMENTAL)"
+ depends on BLOCK
+ select EXPORTFS
+ select CLOSURES
+ select LIBCRC32C
+ select CRC64
+ select FS_POSIX_ACL
+ select LZ4_COMPRESS
+ select LZ4_DECOMPRESS
+ select LZ4HC_COMPRESS
+ select LZ4HC_DECOMPRESS
+ select ZLIB_DEFLATE
+ select ZLIB_INFLATE
+ select ZSTD_COMPRESS
+ select ZSTD_DECOMPRESS
+ select CRYPTO_SHA256
+ select CRYPTO_CHACHA20
+ select CRYPTO_POLY1305
+ select KEYS
+ select RAID6_PQ
+ select XOR_BLOCKS
+ select XXHASH
+ select SRCU
+ select SYMBOLIC_ERRNAME
+ help
+	  The bcachefs filesystem - a modern, copy-on-write filesystem, with
+ support for multiple devices, compression, checksumming, etc.
+
+config BCACHEFS_QUOTA
+ bool "bcachefs quota support"
+ depends on BCACHEFS_FS
+ select QUOTACTL
+
+config BCACHEFS_ERASURE_CODING
+ bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
+ depends on BCACHEFS_FS
+ select QUOTACTL
+ help
+	  This enables the "erasure_code" filesystem and inode option, which
+	  organizes data into Reed-Solomon stripes instead of ordinary
+ replication.
+
+	  WARNING: this feature is still undergoing on-disk format changes, and
+ should only be enabled for testing purposes.
+
+config BCACHEFS_POSIX_ACL
+ bool "bcachefs POSIX ACL support"
+ depends on BCACHEFS_FS
+ select FS_POSIX_ACL
+
+config BCACHEFS_DEBUG_TRANSACTIONS
+ bool "bcachefs runtime info"
+ depends on BCACHEFS_FS
+ help
+ This makes the list of running btree transactions available in debugfs.
+
+ This is a highly useful debugging feature but does add a small amount of overhead.
+
+config BCACHEFS_DEBUG
+ bool "bcachefs debugging"
+ depends on BCACHEFS_FS
+ help
+ Enables many extra debugging checks and assertions.
+
+ The resulting code will be significantly slower than normal; you
+ probably shouldn't select this option unless you're a developer.
+
+config BCACHEFS_TESTS
+ bool "bcachefs unit and performance tests"
+ depends on BCACHEFS_FS
+ help
+	  Include some unit and performance tests for the core btree code.
+
+config BCACHEFS_LOCK_TIME_STATS
+ bool "bcachefs lock time statistics"
+ depends on BCACHEFS_FS
+ help
+	  Expose statistics in debugfs for how long locks were held.
+
+config BCACHEFS_NO_LATENCY_ACCT
+ bool "disable latency accounting and time stats"
+ depends on BCACHEFS_FS
+ help
+	  This disables device latency tracking and time stats; intended only for performance testing.
+
+config MEAN_AND_VARIANCE_UNIT_TEST
+ tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ depends on BCACHEFS_FS
+ default KUNIT_ALL_TESTS
+ help
+	  This option enables the KUnit tests for the mean_and_variance module.
+ If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
new file mode 100644
index 000000000000..b81268418174
--- /dev/null
+++ b/fs/bcachefs/Makefile
@@ -0,0 +1,91 @@
+
+obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
+
+bcachefs-y := \
+ acl.o \
+ alloc_background.o \
+ alloc_foreground.o \
+ backpointers.o \
+ bkey.o \
+ bkey_methods.o \
+ bkey_sort.o \
+ bset.o \
+ btree_cache.o \
+ btree_gc.o \
+ btree_io.o \
+ btree_iter.o \
+ btree_journal_iter.o \
+ btree_key_cache.o \
+ btree_locking.o \
+ btree_trans_commit.o \
+ btree_update.o \
+ btree_update_interior.o \
+ btree_write_buffer.o \
+ buckets.o \
+ buckets_waiting_for_journal.o \
+ chardev.o \
+ checksum.o \
+ clock.o \
+ compress.o \
+ counters.o \
+ darray.o \
+ debug.o \
+ dirent.o \
+ disk_groups.o \
+ data_update.o \
+ ec.o \
+ errcode.o \
+ error.o \
+ extents.o \
+ extent_update.o \
+ fs.o \
+ fs-common.o \
+ fs-ioctl.o \
+ fs-io.o \
+ fs-io-buffered.o \
+ fs-io-direct.o \
+ fs-io-pagecache.o \
+ fsck.o \
+ inode.o \
+ io_read.o \
+ io_misc.o \
+ io_write.o \
+ journal.o \
+ journal_io.o \
+ journal_reclaim.o \
+ journal_sb.o \
+ journal_seq_blacklist.o \
+ keylist.o \
+ logged_ops.o \
+ lru.o \
+ mean_and_variance.o \
+ migrate.o \
+ move.o \
+ movinggc.o \
+ nocow_locking.o \
+ opts.o \
+ printbuf.o \
+ quota.o \
+ rebalance.o \
+ recovery.o \
+ reflink.o \
+ replicas.o \
+ sb-clean.o \
+ sb-downgrade.o \
+ sb-errors.o \
+ sb-members.o \
+ siphash.o \
+ six.o \
+ snapshot.o \
+ subvolume.o \
+ super.o \
+ super-io.o \
+ sysfs.o \
+ tests.o \
+ trace.o \
+ two_state_shared_lock.o \
+ util.o \
+ varint.o \
+ xattr.o
+
+obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
new file mode 100644
index 000000000000..3640f417cce1
--- /dev/null
+++ b/fs/bcachefs/acl.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+
+#include "acl.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+static const char * const acl_types[] = {
+ [ACL_USER_OBJ] = "user_obj",
+ [ACL_USER] = "user",
+ [ACL_GROUP_OBJ] = "group_obj",
+ [ACL_GROUP] = "group",
+ [ACL_MASK] = "mask",
+ [ACL_OTHER] = "other",
+ NULL,
+};
+
+void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
+{
+ const void *p, *end = value + size;
+
+ if (!value ||
+ size < sizeof(bch_acl_header) ||
+ ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
+ return;
+
+ p = value + sizeof(bch_acl_header);
+ while (p < end) {
+ const bch_acl_entry *in = p;
+ unsigned tag = le16_to_cpu(in->e_tag);
+
+ prt_str(out, acl_types[tag]);
+
+ switch (tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ p += sizeof(bch_acl_entry_short);
+ break;
+ case ACL_USER:
+ prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ }
+
+ prt_printf(out, " %o", le16_to_cpu(in->e_perm));
+
+ if (p != end)
+ prt_char(out, ' ');
+ }
+}
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "fs.h"
+
+#include <linux/fs.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
+{
+ return sizeof(bch_acl_header) +
+ sizeof(bch_acl_entry_short) * nr_short +
+ sizeof(bch_acl_entry) * nr_long;
+}
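
As a concrete illustration of the helper above: a typical access ACL with the four required short entries (user_obj, group_obj, mask, other) plus one named user works out to 28 bytes, assuming the entry layouts declared in acl.h (4-byte header, 4-byte short entry, 8-byte full entry). A standalone sketch of that calculation:

	#include <stdio.h>

	int main(void)
	{
		unsigned hdr = 4, short_e = 4, long_e = 8;	/* sizes from acl.h */
		unsigned nr_short = 4, nr_long = 1;		/* typical access ACL */

		/* mirrors bch2_acl_size(): header + short entries + long entries */
		printf("acl xattr value: %u bytes\n",
		       hdr + short_e * nr_short + long_e * nr_long);	/* 28 */
		return 0;
	}
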
+
+static inline int acl_to_xattr_type(int type)
+{
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
+ case ACL_TYPE_DEFAULT:
+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
+ const void *value, size_t size)
+{
+ const void *p, *end = value + size;
+ struct posix_acl *acl;
+ struct posix_acl_entry *out;
+ unsigned count = 0;
+ int ret;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(bch_acl_header))
+ goto invalid;
+ if (((bch_acl_header *)value)->a_version !=
+ cpu_to_le32(BCH_ACL_VERSION))
+ goto invalid;
+
+ p = value + sizeof(bch_acl_header);
+ while (p < end) {
+ const bch_acl_entry *entry = p;
+
+ if (p + sizeof(bch_acl_entry_short) > end)
+ goto invalid;
+
+ switch (le16_to_cpu(entry->e_tag)) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ p += sizeof(bch_acl_entry_short);
+ break;
+ case ACL_USER:
+ case ACL_GROUP:
+ p += sizeof(bch_acl_entry);
+ break;
+ default:
+ goto invalid;
+ }
+
+ count++;
+ }
+
+ if (p > end)
+ goto invalid;
+
+ if (!count)
+ return NULL;
+
+ acl = allocate_dropping_locks(trans, ret,
+ posix_acl_alloc(count, _gfp));
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ if (ret) {
+ kfree(acl);
+ return ERR_PTR(ret);
+ }
+
+ out = acl->a_entries;
+
+ p = value + sizeof(bch_acl_header);
+ while (p < end) {
+ const bch_acl_entry *in = p;
+
+ out->e_tag = le16_to_cpu(in->e_tag);
+ out->e_perm = le16_to_cpu(in->e_perm);
+
+ switch (out->e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ p += sizeof(bch_acl_entry_short);
+ break;
+ case ACL_USER:
+ out->e_uid = make_kuid(&init_user_ns,
+ le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ out->e_gid = make_kgid(&init_user_ns,
+ le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ }
+
+ out++;
+ }
+
+ BUG_ON(out != acl->a_entries + acl->a_count);
+
+ return acl;
+invalid:
+ pr_err("invalid acl entry");
+ return ERR_PTR(-EINVAL);
+}
+
+#define acl_for_each_entry(acl, acl_e) \
+ for (acl_e = acl->a_entries; \
+ acl_e < acl->a_entries + acl->a_count; \
+ acl_e++)
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(struct btree_trans *trans,
+ const struct posix_acl *acl,
+ int type)
+{
+ struct bkey_i_xattr *xattr;
+ bch_acl_header *acl_header;
+ const struct posix_acl_entry *acl_e;
+ void *outptr;
+ unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
+
+ acl_for_each_entry(acl, acl_e) {
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ case ACL_GROUP:
+ nr_long++;
+ break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ nr_short++;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ acl_len = bch2_acl_size(nr_short, nr_long);
+ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+
+ if (u64s > U8_MAX)
+ return ERR_PTR(-E2BIG);
+
+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+ if (IS_ERR(xattr))
+ return xattr;
+
+ bkey_xattr_init(&xattr->k_i);
+ xattr->k.u64s = u64s;
+ xattr->v.x_type = acl_to_xattr_type(type);
+ xattr->v.x_name_len = 0;
+ xattr->v.x_val_len = cpu_to_le16(acl_len);
+
+ acl_header = xattr_val(&xattr->v);
+ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+
+ outptr = (void *) acl_header + sizeof(*acl_header);
+
+ acl_for_each_entry(acl, acl_e) {
+ bch_acl_entry *entry = outptr;
+
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ outptr += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
+ outptr += sizeof(bch_acl_entry);
+ break;
+
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ outptr += sizeof(bch_acl_entry_short);
+ break;
+ }
+ }
+
+ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
+
+ return xattr;
+}
+
+struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, int type)
+{
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+ struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c_xattr xattr;
+ struct posix_acl *acl = NULL;
+ struct bkey_s_c k;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash, inode_inum(inode), &search, 0);
+ if (ret) {
+ if (!bch2_err_matches(ret, ENOENT))
+ acl = ERR_PTR(ret);
+ goto out;
+ }
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret) {
+ acl = ERR_PTR(ret);
+ goto out;
+ }
+
+ xattr = bkey_s_c_to_xattr(k);
+ acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+
+ if (!IS_ERR(acl))
+ set_cached_acl(&inode->v, type, acl);
+out:
+ if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return acl;
+}
+
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode_u,
+ struct posix_acl *acl, int type)
+{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
+ int ret;
+
+ if (type == ACL_TYPE_DEFAULT &&
+ !S_ISDIR(inode_u->bi_mode))
+ return acl ? -EACCES : 0;
+
+ if (acl) {
+ struct bkey_i_xattr *xattr =
+ bch2_acl_to_xattr(trans, acl, type);
+ if (IS_ERR(xattr))
+ return PTR_ERR(xattr);
+
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &xattr->k_i, 0);
+ } else {
+ struct xattr_search_key search =
+ X_SEARCH(acl_to_xattr_type(type), "", 0);
+
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &search);
+ }
+
+ return bch2_err_matches(ret, ENOENT) ? 0 : ret;
+}
+
+int bch2_set_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry,
+ struct posix_acl *_acl, int type)
+{
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter inode_iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ struct posix_acl *acl;
+ umode_t mode;
+ int ret;
+
+ mutex_lock(&inode->ei_update_lock);
+retry:
+ bch2_trans_begin(trans);
+ acl = _acl;
+
+ ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
+ bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto btree_err;
+
+ mode = inode_u.bi_mode;
+
+ if (type == ACL_TYPE_ACCESS) {
+ ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
+ if (ret)
+ goto btree_err;
+ }
+
+ ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
+ if (ret)
+ goto btree_err;
+
+ inode_u.bi_ctime = bch2_current_time(c);
+ inode_u.bi_mode = mode;
+
+ ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+btree_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ if (unlikely(ret))
+ goto err;
+
+ bch2_inode_update_after_write(trans, inode, &inode_u,
+ ATTR_CTIME|ATTR_MODE);
+
+ set_cached_acl(&inode->v, type, acl);
+err:
+ mutex_unlock(&inode->ei_update_lock);
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode,
+ umode_t mode,
+ struct posix_acl **new_acl)
+{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
+ struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
+ struct btree_iter iter;
+ struct bkey_s_c_xattr xattr;
+ struct bkey_i_xattr *new;
+ struct posix_acl *acl = NULL;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash_info, inum, &search, BTREE_ITER_INTENT);
+ if (ret)
+ return bch2_err_matches(ret, ENOENT) ? 0 : ret;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+ xattr = bkey_s_c_to_xattr(k);
+
+ acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+ if (IS_ERR_OR_NULL(acl))
+ goto err;
+
+ ret = allocate_dropping_locks_errcode(trans,
+ __posix_acl_chmod(&acl, _gfp, mode));
+ if (ret)
+ goto err;
+
+ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+ if (IS_ERR(new)) {
+ ret = PTR_ERR(new);
+ goto err;
+ }
+
+ new->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
+ *new_acl = acl;
+ acl = NULL;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (!IS_ERR_OR_NULL(acl))
+ kfree(acl);
+ return ret;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
new file mode 100644
index 000000000000..27e7eec0f278
--- /dev/null
+++ b/fs/bcachefs/acl.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ACL_H
+#define _BCACHEFS_ACL_H
+
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+
+#define BCH_ACL_VERSION 0x0001
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} bch_acl_entry;
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+} bch_acl_entry_short;
+
+typedef struct {
+ __le32 a_version;
+} bch_acl_header;
+
+void bch2_acl_to_text(struct printbuf *, const void *, size_t);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
+
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ struct posix_acl *, int);
+int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ umode_t, struct posix_acl **);
+
+#else
+
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode_u,
+ struct posix_acl *acl, int type)
+{
+ return 0;
+}
+
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode,
+ umode_t mode,
+ struct posix_acl **new_acl)
+{
+ return 0;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+
+#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
new file mode 100644
index 000000000000..1fec0e67891f
--- /dev/null
+++ b/fs/bcachefs/alloc_background.c
@@ -0,0 +1,2159 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "clock.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+#include "trace.h"
+#include "varint.h"
+
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
+
+/* Persistent alloc info: */
+
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bkey_alloc_unpacked {
+ u64 journal_seq;
+ u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
+ bool need_discard:1;
+ bool need_inc_gen:1;
+#define x(_name, _bits) u##_bits _name;
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+};
+
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+ const void **p, unsigned field)
+{
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
+ u64 v;
+
+ if (!(a->fields & (1 << field)))
+ return 0;
+
+ switch (bytes) {
+ case 1:
+ v = *((const u8 *) *p);
+ break;
+ case 2:
+ v = le16_to_cpup(*p);
+ break;
+ case 4:
+ v = le32_to_cpup(*p);
+ break;
+ case 8:
+ v = le64_to_cpup(*p);
+ break;
+ default:
+ BUG();
+ }
+
+ *p += bytes;
+ return v;
+}
+
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+ const void *d = in->data;
+ unsigned idx = 0;
+
+ out->gen = in->gen;
+
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+}
+
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
+
+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
+ out->journal_seq = le64_to_cpu(a.v->journal_seq);
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
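
The per-field "if (v != out->_name) return -1;" check in the two unpack helpers above catches decoded values that do not fit their (narrower) destination field. A standalone sketch of the same technique, using a hypothetical u8 field:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t v = 300;
		uint8_t field = v;	/* narrower destination, as in the unpack macros */

		/* assigning and comparing back detects truncation: 300 -> 44 */
		if (v != field)
			printf("overflow detected: %llu does not fit\n",
			       (unsigned long long)v);
		return 0;
	}
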
+
+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked ret = { .gen = 0 };
+
+ switch (k.k->type) {
+ case KEY_TYPE_alloc:
+ bch2_alloc_unpack_v1(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v2:
+ bch2_alloc_unpack_v2(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v3:
+ bch2_alloc_unpack_v3(&ret, k);
+ break;
+ }
+
+ return ret;
+}
+
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
+{
+ unsigned i, bytes = offsetof(struct bch_alloc, data);
+
+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
+ if (a->fields & (1 << i))
+ bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
+
+ return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+ int ret = 0;
+
+ /* allow for unknown fields */
+ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
+ alloc_v1_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_alloc_unpacked u;
+ int ret = 0;
+
+ bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
+ alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_alloc_unpacked u;
+ int ret = 0;
+
+ bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
+ alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
+ alloc_v4_val_size_bad,
+ "bad val size (%u > %zu)",
+ alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
+
+ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
+ alloc_v4_backpointers_start_bad,
+ "invalid backpointers_start");
+
+ bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
+ alloc_key_data_type_bad,
+ "invalid data type (got %u should be %u)",
+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+
+ switch (a.v->data_type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ bkey_fsck_err_on(a.v->dirty_sectors ||
+ a.v->cached_sectors ||
+ a.v->stripe, c, err,
+ alloc_key_empty_but_have_data,
+ "empty data type free but have data");
+ break;
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
+ alloc_key_dirty_sectors_0,
+ "data_type %s but dirty_sectors==0",
+ bch2_data_types[a.v->data_type]);
+ break;
+ case BCH_DATA_cached:
+ bkey_fsck_err_on(!a.v->cached_sectors ||
+ a.v->dirty_sectors ||
+ a.v->stripe, c, err,
+ alloc_key_cached_inconsistency,
+ "data type inconsistency");
+
+ bkey_fsck_err_on(!a.v->io_time[READ] &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
+ c, err,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time == 0");
+ break;
+ case BCH_DATA_stripe:
+ break;
+ }
+fsck_err:
+ return ret;
+}
+
+void bch2_alloc_v4_swab(struct bkey_s k)
+{
+ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+ struct bch_backpointer *bp, *bps;
+
+ a->journal_seq = swab64(a->journal_seq);
+ a->flags = swab32(a->flags);
+ a->dirty_sectors = swab32(a->dirty_sectors);
+ a->cached_sectors = swab32(a->cached_sectors);
+ a->io_time[0] = swab64(a->io_time[0]);
+ a->io_time[1] = swab64(a->io_time[1]);
+ a->stripe = swab32(a->stripe);
+ a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+ a->fragmentation_lru = swab64(a->fragmentation_lru);
+
+ bps = alloc_v4_backpointers(a);
+ for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
+ bp->bucket_offset = swab40(bp->bucket_offset);
+ bp->bucket_len = swab32(bp->bucket_len);
+ bch2_bpos_swab(&bp->pos);
+ }
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+ unsigned i;
+
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "gen %u oldest_gen %u data_type %s",
+ a->gen, a->oldest_gen,
+ a->data_type < BCH_DATA_NR
+ ? bch2_data_types[a->data_type]
+ : "(invalid data type)");
+ prt_newline(out);
+ prt_printf(out, "journal_seq %llu", a->journal_seq);
+ prt_newline(out);
+ prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_newline(out);
+ prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_newline(out);
+ prt_printf(out, "dirty_sectors %u", a->dirty_sectors);
+ prt_newline(out);
+ prt_printf(out, "cached_sectors %u", a->cached_sectors);
+ prt_newline(out);
+ prt_printf(out, "stripe %u", a->stripe);
+ prt_newline(out);
+ prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
+ prt_newline(out);
+ prt_printf(out, "io_time[READ] %llu", a->io_time[READ]);
+ prt_newline(out);
+ prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
+ prt_newline(out);
+ prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
+ prt_newline(out);
+ prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ prt_newline(out);
+
+ if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
+ struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
+ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
+
+ prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
+ printbuf_indent_add(out, 2);
+
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
+ prt_newline(out);
+ bch2_backpointer_to_text(out, &bps[i]);
+ }
+
+ printbuf_indent_sub(out, 2);
+ }
+
+ printbuf_indent_sub(out, 2);
+}
+
+void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
+{
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ void *src, *dst;
+
+ *out = *bkey_s_c_to_alloc_v4(k).v;
+
+ src = alloc_v4_backpointers(out);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(out);
+
+ if (src < dst)
+ memset(src, 0, dst - src);
+
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
+ } else {
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ *out = (struct bch_alloc_v4) {
+ .journal_seq = u.journal_seq,
+ .flags = u.need_discard,
+ .gen = u.gen,
+ .oldest_gen = u.oldest_gen,
+ .data_type = u.data_type,
+ .stripe_redundancy = u.stripe_redundancy,
+ .dirty_sectors = u.dirty_sectors,
+ .cached_sectors = u.cached_sectors,
+ .io_time[READ] = u.read_time,
+ .io_time[WRITE] = u.write_time,
+ .stripe = u.stripe,
+ };
+
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ }
+}
+
+static noinline struct bkey_i_alloc_v4 *
+__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_i_alloc_v4 *ret;
+
+ ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
+ if (IS_ERR(ret))
+ return ret;
+
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ void *src, *dst;
+
+ bkey_reassemble(&ret->k_i, k);
+
+ src = alloc_v4_backpointers(&ret->v);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(&ret->v);
+
+ if (src < dst)
+ memset(src, 0, dst - src);
+
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
+ set_alloc_v4_u64s(ret);
+ } else {
+ bkey_alloc_v4_init(&ret->k_i);
+ ret->k.p = k.k->p;
+ bch2_alloc_to_v4(k, &ret->v);
+ }
+ return ret;
+}
+
+static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v4 a;
+
+ if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
+ ((a = bkey_s_c_to_alloc_v4(k), true) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
+ return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
+
+ return __bch2_alloc_to_v4_mut(trans, k);
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ return bch2_alloc_to_v4_mut_inlined(trans, k);
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
+{
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (unlikely(ret))
+ goto err;
+ return a;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+}
+
+static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
+{
+ *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
+
+ pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
+ return pos;
+}
+
+static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
+{
+ pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
+ pos.offset += offset;
+ return pos;
+}
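
alloc_gens_pos() and bucket_gens_pos_to_alloc() above split a bucket number into a bucket_gens key position plus an index into that key's gens array, and invert the split. A round-trip sketch; the shift width of 8 here is an assumption standing in for KEY_TYPE_BUCKET_GENS_BITS:

	#include <assert.h>
	#include <stdint.h>

	#define GENS_BITS	8			/* assumed for illustration */
	#define GENS_MASK	((1U << GENS_BITS) - 1)

	int main(void)
	{
		uint64_t bucket = 123456;
		unsigned offset = bucket & GENS_MASK;	/* slot within the gens array */
		uint64_t key_off = bucket >> GENS_BITS;	/* bucket_gens key offset */

		/* the two conversions invert each other */
		assert(((key_off << GENS_BITS) + offset) == bucket);
		return 0;
	}
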
+
+static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
+{
+ return k.k->type == KEY_TYPE_bucket_gens
+ ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
+ : 0;
+}
+
+int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
+ bucket_gens_val_size_bad,
+ "bad val size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
+fsck_err:
+ return ret;
+}
+
+void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
+ if (i)
+ prt_char(out, ' ');
+ prt_printf(out, "%u", g.v->gens[i]);
+ }
+}
+
+int bch2_bucket_gens_init(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a;
+ struct bkey_i_bucket_gens g;
+ bool have_bucket_gens_key = false;
+ unsigned offset;
+ struct bpos pos;
+ u8 gen;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_bucket_exists(c, k.k->p))
+ continue;
+
+ gen = bch2_alloc_to_v4(k, &a)->gen;
+ pos = alloc_gens_pos(iter.pos, &offset);
+
+ if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+ if (ret)
+ break;
+ have_bucket_gens_key = false;
+ }
+
+ if (!have_bucket_gens_key) {
+ bkey_bucket_gens_init(&g.k_i);
+ g.k.p = pos;
+ have_bucket_gens_key = true;
+ }
+
+ g.v.gens[offset] = gen;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (have_bucket_gens_key && !ret)
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_alloc_read(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ int ret;
+
+ down_read(&c->gc_lock);
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
+ const struct bch_bucket_gens *g;
+ u64 b;
+
+ for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
+
+ if (k.k->type != KEY_TYPE_bucket_gens)
+ continue;
+
+ g = bkey_s_c_to_bucket_gens(k).v;
+
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_exists2(c, k.k->p.inode))
+ continue;
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ for (b = max_t(u64, ca->mi.first_bucket, start);
+ b < min_t(u64, ca->mi.nbuckets, end);
+ b++)
+ *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ } else {
+ struct bch_alloc_v4 a;
+
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_bucket_exists(c, k.k->p))
+ continue;
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ bch2_trans_put(trans);
+ up_read(&c->gc_lock);
+
+ if (ret)
+ bch_err_fn(c, ret);
+
+ return ret;
+}
+
+/* Free space/discard btree: */
+
+static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ const struct bch_alloc_v4 *a,
+ bool set)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+ struct btree_iter iter;
+ struct bkey_s_c old;
+ struct bkey_i *k;
+ enum btree_id btree;
+ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (a->data_type != BCH_DATA_free &&
+ a->data_type != BCH_DATA_need_discard)
+ return 0;
+
+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ bkey_init(&k->k);
+ k->k.type = new_type;
+
+ switch (a->data_type) {
+ case BCH_DATA_free:
+ btree = BTREE_ID_freespace;
+ k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
+ bch2_key_resize(&k->k, 1);
+ break;
+ case BCH_DATA_need_discard:
+ btree = BTREE_ID_need_discard;
+ k->k.p = alloc_k.k->p;
+ break;
+ default:
+ return 0;
+ }
+
+ old = bch2_bkey_get_iter(trans, &iter, btree,
+ bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ ret = bkey_err(old);
+ if (ret)
+ return ret;
+
+ if (ca->mi.freespace_initialized &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
+ bch2_trans_inconsistent_on(old.k->type != old_type, trans,
+ "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
+ " for %s",
+ set ? "setting" : "clearing",
+ bch2_btree_id_str(btree),
+ iter.pos.inode,
+ iter.pos.offset,
+ bch2_bkey_types[old.k->type],
+ bch2_bkey_types[old_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, k, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
+ struct bpos bucket, u8 gen)
+{
+ struct btree_iter iter;
+ unsigned offset;
+ struct bpos pos = alloc_gens_pos(bucket, &offset);
+ struct bkey_i_bucket_gens *g;
+ struct bkey_s_c k;
+ int ret;
+
+ g = bch2_trans_kmalloc(trans, sizeof(*g));
+ ret = PTR_ERR_OR_ZERO(g);
+ if (ret)
+ return ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_bucket_gens) {
+ bkey_bucket_gens_init(&g->k_i);
+ g->k.p = iter.pos;
+ } else {
+ bkey_reassemble(&g->k_i, k);
+ }
+
+ g->v.gens[offset] = gen;
+
+ ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_trans_mark_alloc(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 old_a_convert, *new_a;
+ const struct bch_alloc_v4 *old_a;
+ u64 old_lru, new_lru;
+ int ret = 0;
+
+ /*
+ * Deletion only happens in the device removal path, with
+ * BTREE_TRIGGER_NORUN:
+ */
+ BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
+
+ old_a = bch2_alloc_to_v4(old, &old_a_convert);
+ new_a = &bkey_i_to_alloc_v4(new)->v;
+
+ new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+
+ if (new_a->dirty_sectors > old_a->dirty_sectors ||
+ new_a->cached_sectors > old_a->cached_sectors) {
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
+ }
+
+ if (data_type_is_empty(new_a->data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ }
+
+ if (old_a->data_type != new_a->data_type ||
+ (new_a->data_type == BCH_DATA_free &&
+ alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
+ if (ret)
+ return ret;
+ }
+
+ if (new_a->data_type == BCH_DATA_cached &&
+ !new_a->io_time[READ])
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+ old_lru = alloc_lru_idx_read(*old_a);
+ new_lru = alloc_lru_idx_read(*new_a);
+
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new->k.p.inode,
+ bucket_to_u64(new->k.p),
+ old_lru, new_lru);
+ if (ret)
+ return ret;
+ }
+
+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+ bch_dev_bkey_exists(c, new->k.p.inode));
+
+ if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+ ret = bch2_lru_change(trans,
+ BCH_LRU_FRAGMENTATION_START,
+ bucket_to_u64(new->k.p),
+ old_a->fragmentation_lru, new_a->fragmentation_lru);
+ if (ret)
+ return ret;
+ }
+
+ if (old_a->gen != new_a->gen) {
+ ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
+ * extents-style btrees, but works on non-extents btrees:
+ */
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
+{
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+
+ if (bkey_err(k))
+ return k;
+
+ if (k.k->type) {
+ return k;
+ } else {
+ struct btree_iter iter2;
+ struct bpos next;
+
+ bch2_trans_copy_iter(&iter2, iter);
+
+ if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
+ end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
+
+ end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
+
+ /*
+		 * btree node min/max is a closed interval; peek_upto takes a
+		 * half-open interval:
+ */
+ k = bch2_btree_iter_peek_upto(&iter2, end);
+ next = iter2.pos;
+ bch2_trans_iter_exit(iter->trans, &iter2);
+
+ BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
+
+ if (bkey_err(k))
+ return k;
+
+ bkey_init(hole);
+ hole->p = iter->pos;
+
+ bch2_key_resize(hole, next.offset - iter->pos.offset);
+ return (struct bkey_s_c) { hole, NULL };
+ }
+}
+
+static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
+{
+ struct bch_dev *ca;
+ unsigned iter;
+
+ if (bch2_dev_bucket_exists(c, *bucket))
+ return true;
+
+ if (bch2_dev_exists2(c, bucket->inode)) {
+ ca = bch_dev_bkey_exists(c, bucket->inode);
+
+ if (bucket->offset < ca->mi.first_bucket) {
+ bucket->offset = ca->mi.first_bucket;
+ return true;
+ }
+
+ bucket->inode++;
+ bucket->offset = 0;
+ }
+
+ rcu_read_lock();
+ iter = bucket->inode;
+ ca = __bch2_next_dev(c, &iter, NULL);
+ if (ca)
+ *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
+ rcu_read_unlock();
+
+ return ca != NULL;
+}
+
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
+{
+ struct bch_fs *c = iter->trans->c;
+ struct bkey_s_c k;
+again:
+ k = bch2_get_key_or_hole(iter, POS_MAX, hole);
+ if (bkey_err(k))
+ return k;
+
+ if (!k.k->type) {
+ struct bpos bucket = bkey_start_pos(k.k);
+
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ if (!next_bucket(c, &bucket))
+ return bkey_s_c_null;
+
+ bch2_btree_iter_set_pos(iter, bucket);
+ goto again;
+ }
+
+ if (!bch2_dev_bucket_exists(c, k.k->p)) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+
+ bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
+ }
+ }
+
+ return k;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_key(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ struct btree_iter *alloc_iter,
+ struct btree_iter *discard_iter,
+ struct btree_iter *freespace_iter,
+ struct btree_iter *bucket_gens_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ unsigned discard_key_type, freespace_key_type;
+ unsigned gens_offset;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+ alloc_key_to_missing_dev_bucket,
+ "alloc key for invalid device:bucket %llu:%llu",
+ alloc_k.k->p.inode, alloc_k.k->p.offset))
+ return bch2_btree_delete_at(trans, alloc_iter, 0);
+
+ ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+ if (!ca->mi.freespace_initialized)
+ return 0;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
+ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
+ k = bch2_btree_iter_peek_slot(discard_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != discard_key_type &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, need_discard_key_wrong,
+ "incorrect key in need_discard btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[discard_key_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = discard_key_type;
+ update->k.p = discard_iter->pos;
+
+ ret = bch2_trans_update(trans, discard_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+
+ freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
+ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
+ k = bch2_btree_iter_peek_slot(freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != freespace_key_type &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, freespace_key_wrong,
+ "incorrect key in freespace btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[freespace_key_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = freespace_key_type;
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k, 1);
+
+ ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+
+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
+ k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (a->gen != alloc_gen(k, gens_offset) &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ " %s",
+ alloc_gen(k, gens_offset), a->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i_bucket_gens *g =
+ bch2_trans_kmalloc(trans, sizeof(*g));
+
+ ret = PTR_ERR_OR_ZERO(g);
+ if (ret)
+ goto err;
+
+ if (k.k->type == KEY_TYPE_bucket_gens) {
+ bkey_reassemble(&g->k_i, k);
+ } else {
+ bkey_bucket_gens_init(&g->k_i);
+ g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
+ }
+
+ g->v.gens[gens_offset] = a->gen;
+
+ ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
+ struct bpos start,
+ struct bpos *end,
+ struct btree_iter *freespace_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ ca = bch_dev_bkey_exists(c, start.inode);
+ if (!ca->mi.freespace_initialized)
+ return 0;
+
+ bch2_btree_iter_set_pos(freespace_iter, start);
+
+ k = bch2_btree_iter_peek_slot(freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ *end = bkey_min(k.k->p, *end);
+
+ if (k.k->type != KEY_TYPE_set &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
+ " device %llu buckets %llu-%llu",
+ freespace_iter->pos.inode,
+ freespace_iter->pos.offset,
+ end->offset))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = KEY_TYPE_set;
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k,
+ min_t(u64, U32_MAX, end->offset -
+ freespace_iter->pos.offset));
+
+ ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
+ struct bpos start,
+ struct bpos *end,
+ struct btree_iter *bucket_gens_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ unsigned i, gens_offset, gens_end_offset;
+ int ret;
+
+ if (c->sb.version < bcachefs_metadata_version_bucket_gens)
+ return 0;
+
+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
+
+ k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
+ alloc_gens_pos(*end, &gens_end_offset)))
+ gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
+
+ if (k.k->type == KEY_TYPE_bucket_gens) {
+ struct bkey_i_bucket_gens g;
+ bool need_update = false;
+
+ bkey_reassemble(&g.k_i, k);
+
+ for (i = gens_offset; i < gens_end_offset; i++) {
+ if (fsck_err_on(g.v.gens[i], c,
+ bucket_gens_hole_wrong,
+ "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
+ bucket_gens_pos_to_alloc(k.k->p, i).inode,
+ bucket_gens_pos_to_alloc(k.k->p, i).offset,
+ g.v.gens[i])) {
+ g.v.gens[i] = 0;
+ need_update = true;
+ }
+ }
+
+ if (need_update) {
+ struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
+
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ memcpy(u, &g, sizeof(g));
+
+ ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
+ if (ret)
+ goto err;
+ }
+ }
+
+ *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter;
+ struct bkey_s_c alloc_k;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ u64 genbits;
+ struct bpos pos;
+ enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
+ ? BCH_DATA_need_discard
+ : BCH_DATA_free;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ pos = iter->pos;
+ pos.offset &= ~(~0ULL << 56);
+ genbits = iter->pos.offset & (~0ULL << 56);
+
+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+ need_discard_freespace_key_to_invalid_dev_bucket,
+ "entry in %s btree for nonexistant dev:bucket %llu:%llu",
+ bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
+ goto delete;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ if (fsck_err_on(a->data_type != state ||
+ (state == BCH_DATA_free &&
+ genbits != alloc_freespace_genbits(*a)), c,
+ need_discard_freespace_key_bad,
+ "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ bch2_btree_id_str(iter->btree_id),
+ iter->pos.inode,
+ iter->pos.offset,
+ a->data_type == state,
+ genbits >> 56, alloc_freespace_genbits(*a) >> 56))
+ goto delete;
+out:
+fsck_err:
+ set_btree_iter_dontneed(&alloc_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+delete:
+ ret = bch2_btree_delete_extent_at(trans, iter,
+ iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+ goto out;
+}
+
+static int bch2_check_discard_freespace_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end)
+{
+ if (!btree_id_is_extents(iter->btree_id)) {
+ return __bch2_check_discard_freespace_key(trans, iter);
+ } else {
+ int ret = 0;
+
+ while (!bkey_eq(iter->pos, end) &&
+ !(ret = btree_trans_too_many_iters(trans) ?:
+ __bch2_check_discard_freespace_key(trans, iter)))
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+
+ return ret;
+ }
+}
+
+/*
+ * We've already checked that generation numbers in the bucket_gens btree are
+ * valid for buckets that exist; this just checks for keys for nonexistent
+ * buckets.
+ */
+static noinline_for_stack
+int bch2_check_bucket_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_bucket_gens g;
+ struct bch_dev *ca;
+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
+ u64 b;
+ bool need_update = false, dev_exists;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
+ bkey_reassemble(&g.k_i, k);
+
+ /* if no bch_dev, skip out whether we repair or not */
+ dev_exists = bch2_dev_exists2(c, k.k->p.inode);
+ if (!dev_exists) {
+ if (fsck_err_on(!dev_exists, c,
+ bucket_gens_to_invalid_dev,
+ "bucket_gens key for invalid device:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ }
+ goto out;
+ }
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ if (fsck_err_on(end <= ca->mi.first_bucket ||
+ start >= ca->mi.nbuckets, c,
+ bucket_gens_to_invalid_buckets,
+ "bucket_gens key for invalid buckets:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
+
+ for (b = start; b < ca->mi.first_bucket; b++)
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ bucket_gens_nonzero_for_invalid_buckets,
+ "bucket_gens key has nonzero gen for invalid bucket")) {
+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+ need_update = true;
+ }
+
+ for (b = ca->mi.nbuckets; b < end; b++)
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ bucket_gens_nonzero_for_invalid_buckets,
+ "bucket_gens key has nonzero gen for invalid bucket")) {
+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+ need_update = true;
+ }
+
+ if (need_update) {
+ struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
+
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto out;
+
+ memcpy(u, &g, sizeof(g));
+ ret = bch2_trans_update(trans, iter, u, 0);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_alloc_info(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
+ struct bkey hole;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH);
+
+ while (1) {
+ struct bpos next;
+
+ bch2_trans_begin(trans);
+
+ k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
+ ret = bkey_err(k);
+ if (ret)
+ goto bkey_err;
+
+ if (!k.k)
+ break;
+
+ if (k.k->type) {
+ next = bpos_nosnap_successor(k.k->p);
+
+ ret = bch2_check_alloc_key(trans,
+ k, &iter,
+ &discard_iter,
+ &freespace_iter,
+ &bucket_gens_iter);
+ if (ret)
+ goto bkey_err;
+ } else {
+ next = k.k->p;
+
+ ret = bch2_check_alloc_hole_freespace(trans,
+ bkey_start_pos(k.k),
+ &next,
+ &freespace_iter) ?:
+ bch2_check_alloc_hole_bucket_gens(trans,
+ bkey_start_pos(k.k),
+ &next,
+ &bucket_gens_iter);
+ if (ret)
+ goto bkey_err;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_pos(&iter, next);
+bkey_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &bucket_gens_iter);
+ bch2_trans_iter_exit(trans, &freespace_iter);
+ bch2_trans_iter_exit(trans, &discard_iter);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret < 0)
+ goto err;
+
+ ret = for_each_btree_key2(trans, iter,
+ BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+ for_each_btree_key2(trans, iter,
+ BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_bucket_gens_key(trans, &iter, k));
+err:
+ bch2_trans_put(trans);
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
+ struct btree_iter *alloc_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter lru_iter;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct bkey_s_c alloc_k, lru_k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ alloc_k = bch2_btree_iter_peek(alloc_iter);
+ if (!alloc_k.k)
+ return 0;
+
+ ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ if (a->data_type != BCH_DATA_cached)
+ return 0;
+
+ lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
+ lru_pos(alloc_k.k->p.inode,
+ bucket_to_u64(alloc_k.k->p),
+ a->io_time[READ]), 0);
+ ret = bkey_err(lru_k);
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(!a->io_time[READ], c,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
+ fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+ alloc_key_to_missing_lru_entry,
+ "missing lru entry\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ u64 read_time = a->io_time[READ] ?:
+ atomic64_read(&c->io_clock[READ].now);
+
+ ret = bch2_lru_set(trans,
+ alloc_k.k->p.inode,
+ bucket_to_u64(alloc_k.k->p),
+ read_time);
+ if (ret)
+ goto err;
+
+ if (a->io_time[READ] != read_time) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
+
+ a_mut->v.io_time[READ] = read_time;
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_alloc_to_lru_ref(trans, &iter)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct btree_iter *need_discard_iter,
+ struct bpos *discard_pos_done,
+ u64 *seen,
+ u64 *open,
+ u64 *need_journal_commit,
+ u64 *discarded)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos pos = need_discard_iter->pos;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ struct bkey_i_alloc_v4 *a;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
+ return 0;
+ }
+
+ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
+ (*open)++;
+ goto out;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ pos.inode, pos.offset)) {
+ (*need_journal_commit)++;
+ goto out;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ need_discard_iter->pos,
+ BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
+ }
+
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
+ bch2_trans_inconsistent(trans,
+ "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
+ "%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ }
+ goto out;
+ }
+
+ if (a->v.data_type != BCH_DATA_need_discard) {
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
+ bch2_trans_inconsistent(trans,
+ "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ }
+
+ goto out;
+ }
+
+ if (!bkey_eq(*discard_pos_done, iter.pos) &&
+ ca->mi.discard && !c->opts.nochanges) {
+ /*
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
+ */
+ bch2_trans_unlock(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+ *discard_pos_done = iter.pos;
+
+ ret = bch2_trans_relock_notrace(trans);
+ if (ret)
+ goto out;
+ }
+
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+ a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+write:
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto out;
+
+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
+ (*discarded)++;
+out:
+ (*seen)++;
+ bch2_trans_iter_exit(trans, &iter);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static void bch2_do_discards_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct bpos discard_pos_done = POS_MAX;
+ int ret;
+
+ /*
+ * We're doing the commit in bch2_discard_one_bucket instead of using
+ * for_each_btree_key_commit() so that we can increment counters after
+ * successful commit:
+ */
+ ret = bch2_trans_run(c,
+ for_each_btree_key2(trans, iter,
+ BTREE_ID_need_discard, POS_MIN, 0, k,
+ bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
+ &seen,
+ &open,
+ &need_journal_commit,
+ &discarded)));
+
+ if (need_journal_commit * 2 > seen)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+
+ trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ bch2_err_str(ret));
+}
+
+void bch2_do_discards(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
+ !queue_work(c->write_ref_wq, &c->discard_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+}
+
+static int invalidate_one_bucket(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k,
+ s64 *nr_to_invalidate)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bkey_i_alloc_v4 *a = NULL;
+ struct printbuf buf = PRINTBUF;
+ struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
+ unsigned cached_sectors;
+ int ret = 0;
+
+ if (*nr_to_invalidate <= 0)
+ return 1;
+
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ prt_str(&buf, "lru entry points to invalid bucket");
+ goto err;
+ }
+
+ if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
+ return 0;
+
+ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ /* We expect harmless races here due to the btree write buffer: */
+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+ goto out;
+
+ BUG_ON(a->v.data_type != BCH_DATA_cached);
+
+ if (!a->v.cached_sectors)
+ bch_err(c, "invalidating empty bucket, confused");
+
+ cached_sectors = a->v.cached_sectors;
+
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ a->v.gen++;
+ a->v.data_type = 0;
+ a->v.dirty_sectors = 0;
+ a->v.cached_sectors = 0;
+ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
+
+ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
+ BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto out;
+
+ trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
+ --*nr_to_invalidate;
+out:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+err:
+ prt_str(&buf, "\n lru key: ");
+ bch2_bkey_val_to_text(&buf, c, lru_k);
+
+ prt_str(&buf, "\n lru entry: ");
+ bch2_lru_pos_to_text(&buf, lru_iter->pos);
+
+ prt_str(&buf, "\n alloc key: ");
+ if (!a)
+ bch2_bpos_to_text(&buf, bucket);
+ else
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+
+ bch_err(c, "%s", buf.buf);
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
+ bch2_inconsistent_error(c);
+ ret = -EINVAL;
+ }
+
+ goto out;
+}
+
+static void bch2_do_invalidates_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned i;
+ int ret = 0;
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (ret)
+ goto err;
+
+ for_each_member_device(ca, c, i) {
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0, 0),
+ lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
+ BTREE_ITER_INTENT, k,
+ invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
+
+ if (ret < 0) {
+ percpu_ref_put(&ca->ref);
+ break;
+ }
+ }
+err:
+ bch2_trans_put(trans);
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
+
+void bch2_do_invalidates(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
+ !queue_work(c->write_ref_wq, &c->invalidate_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
+
+int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket_start, u64 bucket_end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey hole;
+ struct bpos end = POS(ca->dev_idx, bucket_end);
+ struct bch_member *m;
+ unsigned long last_updated = jiffies;
+ int ret;
+
+ BUG_ON(bucket_start > bucket_end);
+ BUG_ON(bucket_end > ca->mi.nbuckets);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
+ BTREE_ITER_PREFETCH);
+ /*
+ * Scan the alloc btree for every bucket on @ca, and add buckets to the
+ * freespace/need_discard/need_gc_gens btrees as needed:
+ */
+ while (1) {
+ if (last_updated + HZ * 10 < jiffies) {
+ bch_info(ca, "%s: currently at %llu/%llu",
+ __func__, iter.pos.offset, ca->mi.nbuckets);
+ last_updated = jiffies;
+ }
+
+ bch2_trans_begin(trans);
+
+ if (bkey_ge(iter.pos, end)) {
+ ret = 0;
+ break;
+ }
+
+ k = bch2_get_key_or_hole(&iter, end, &hole);
+ ret = bkey_err(k);
+ if (ret)
+ goto bkey_err;
+
+ if (k.k->type) {
+ /*
+ * We process live keys in the alloc btree one at a
+ * time:
+ */
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+
+ ret = bch2_bucket_do_index(trans, k, a, true) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_advance(&iter);
+ } else {
+ struct bkey_i *freespace;
+
+ freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
+ ret = PTR_ERR_OR_ZERO(freespace);
+ if (ret)
+ goto bkey_err;
+
+ bkey_init(&freespace->k);
+ freespace->k.type = KEY_TYPE_set;
+ freespace->k.p = k.k->p;
+ freespace->k.size = k.k->size;
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_pos(&iter, k.k->p);
+ }
+bkey_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+
+ if (ret < 0) {
+ bch_err_msg(ca, ret, "initializing free space");
+ return ret;
+ }
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+int bch2_fs_freespace_init(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+ bool doing_init = false;
+
+ /*
+ * We can crash during the device add path, so we need to check this on
+ * every mount:
+ */
+
+ for_each_member_device(ca, c, i) {
+ if (ca->mi.freespace_initialized)
+ continue;
+
+ if (!doing_init) {
+ bch_info(c, "initializing freespace");
+ doing_init = true;
+ }
+
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ bch_err_fn(c, ret);
+ return ret;
+ }
+ }
+
+ if (doing_init) {
+ mutex_lock(&c->sb_lock);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ bch_verbose(c, "done initializing freespace");
+ }
+
+ return 0;
+}
+
+/* Bucket IO clocks: */
+
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ u64 now;
+ int ret = 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (a->v.io_time[rw] == now)
+ goto out;
+
+ a->v.io_time[rw] = now;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* Startup/shutdown (ro/rw): */
+
+void bch2_recalc_capacity(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+ unsigned bucket_size_max = 0;
+ unsigned long ra_pages = 0;
+ unsigned i;
+
+ lockdep_assert_held(&c->state_lock);
+
+ for_each_online_member(ca, c, i) {
+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
+
+ ra_pages += bdi->ra_pages;
+ }
+
+ bch2_set_ra_pages(c, ra_pages);
+
+ for_each_rw_member(ca, c, i) {
+ u64 dev_reserve = 0;
+
+ /*
+ * We need to reserve buckets (from the number
+ * of currently available buckets) against
+ * foreground writes so that mainly copygc can
+ * make forward progress.
+ *
+ * We need enough to refill the various reserves
+ * from scratch - copygc will use its entire
+ * reserve all at once, then run again once its
+ * reserve has been refilled (from the formerly
+ * available buckets).
+ *
+ * This reserve is just used when considering if
+ * allocations for foreground writes must wait -
+ * not -ENOSPC calculations.
+ */
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
+
+ dev_reserve += 1; /* btree write point */
+ dev_reserve += 1; /* copygc write point */
+ dev_reserve += 1; /* rebalance write point */
+
+ dev_reserve *= ca->mi.bucket_size;
+
+ capacity += bucket_to_sector(ca, ca->mi.nbuckets -
+ ca->mi.first_bucket);
+
+ reserved_sectors += dev_reserve * 2;
+
+ bucket_size_max = max_t(unsigned, bucket_size_max,
+ ca->mi.bucket_size);
+ }
+
+ gc_reserve = c->opts.gc_reserve_bytes
+ ? c->opts.gc_reserve_bytes >> 9
+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
+
+ reserved_sectors = max(gc_reserve, reserved_sectors);
+
+ reserved_sectors = min(reserved_sectors, capacity);
+
+ c->capacity = capacity - reserved_sectors;
+
+ c->bucket_size_max = bucket_size_max;
+
+ /* Wake up in case someone was waiting for buckets */
+ closure_wake_up(&c->freelist_wait);
+}
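+
+/*
+ * A rough illustration of the per-device reserve computed above, with
+ * hypothetical numbers (nr_btree_reserve = 7, nbuckets = 1 << 20, 512 sector
+ * buckets):
+ *
+ *    dev_reserve = (7 * 2 + ((1 << 20) >> 6) + 3) * 512
+ *                = (14 + 16384 + 3) * 512 = 8397312 sectors (~4 GiB)
+ *
+ * reserved_sectors then accumulates twice that per rw device, before being
+ * raised to at least gc_reserve and capped at the total capacity.
+ */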
+
+u64 bch2_min_rw_member_capacity(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ u64 ret = U64_MAX;
+
+ for_each_rw_member(ca, c, i)
+ ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
+ return ret;
+}
+
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct open_bucket *ob;
+ bool ret = false;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list &&
+ ob->dev == ca->dev_idx)
+ ret = true;
+ spin_unlock(&ob->lock);
+ }
+
+ return ret;
+}
+
+/* device goes ro: */
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned i;
+
+ /* First, remove device from allocation groups: */
+
+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ clear_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ /*
+ * Capacity is calculated based off of devices in allocation groups:
+ */
+ bch2_recalc_capacity(c);
+
+ bch2_open_buckets_stop(c, ca, false);
+
+ /*
+ * Wake up threads that were blocked on allocation, so they can notice
+ * the device can no longer be allocated from and the capacity has changed:
+ */
+ closure_wake_up(&c->freelist_wait);
+
+ /*
+ * journal_res_get() can block waiting for free space in the journal -
+ * it needs to notice there may not be devices to allocate from anymore:
+ */
+ wake_up(&c->journal.wait);
+
+ /* Now wait for any in flight writes: */
+
+ closure_wait_event(&c->open_buckets_wait,
+ !bch2_dev_has_open_write_point(c, ca));
+}
+
+/* device goes rw: */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ if (ca->mi.data_allowed & (1 << i))
+ set_bit(ca->dev_idx, c->rw_devs[i].d);
+}
+
+void bch2_fs_allocator_background_init(struct bch_fs *c)
+{
+ spin_lock_init(&c->freelist_lock);
+ INIT_WORK(&c->discard_work, bch2_do_discards_work);
+ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
+}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
new file mode 100644
index 000000000000..73faf99a222a
--- /dev/null
+++ b/fs/bcachefs/alloc_background.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
+#define _BCACHEFS_ALLOC_BACKGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "debug.h"
+#include "super.h"
+
+enum bkey_invalid_flags;
+
+/* How out of date a pointer gen is allowed to be: */
+#define BUCKET_GC_GEN_MAX 96U
+
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
+{
+ struct bch_dev *ca;
+
+ if (!bch2_dev_exists2(c, pos.inode))
+ return false;
+
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ return pos.offset >= ca->mi.first_bucket &&
+ pos.offset < ca->mi.nbuckets;
+}
+
+static inline u64 bucket_to_u64(struct bpos bucket)
+{
+ return (bucket.inode << 48) | bucket.offset;
+}
+
+static inline struct bpos u64_to_bucket(u64 bucket)
+{
+ return POS(bucket >> 48, bucket & ~(~0ULL << 48));
+}
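+
+/*
+ * bucket_to_u64()/u64_to_bucket() pack a device:bucket position into a single
+ * u64: device index in the top 16 bits, bucket offset in the low 48.  A
+ * sketch of the round trip (assuming offsets stay below 1ULL << 48):
+ *
+ *    struct bpos b = POS(2, 1234);
+ *    u64 packed = bucket_to_u64(b);    // (2ULL << 48) | 1234
+ *    b = u64_to_bucket(packed);        // POS(2, 1234) again
+ */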
+
+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
+{
+ return a.gen - a.oldest_gen;
+}
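+
+/*
+ * Note that alloc_gc_gen() relies on u8 wraparound, so it stays correct when
+ * the gen counter rolls over - e.g. gen = 5, oldest_gen = 250 gives
+ * (u8)(5 - 250) = 11 generations of staleness.
+ */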
+
+static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
+ u32 cached_sectors,
+ u32 stripe,
+ struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ if (stripe)
+ return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
+ if (dirty_sectors)
+ return data_type;
+ if (cached_sectors)
+ return BCH_DATA_cached;
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BCH_DATA_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BCH_DATA_need_gc_gens;
+ return BCH_DATA_free;
+}
+
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
+ a.stripe, a, data_type);
+}
+
+static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
+{
+ return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
+}
+
+static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
+}
+
+#define DATA_TYPES_MOVABLE \
+ ((1U << BCH_DATA_btree)| \
+ (1U << BCH_DATA_user)| \
+ (1U << BCH_DATA_stripe))
+
+static inline bool data_type_movable(enum bch_data_type type)
+{
+ return (1U << type) & DATA_TYPES_MOVABLE;
+}
+
+static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
+ struct bch_dev *ca)
+{
+ if (!data_type_movable(a.data_type) ||
+ a.dirty_sectors >= ca->mi.bucket_size)
+ return 0;
+
+ return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
+}
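+
+/*
+ * i.e. the fragmentation LRU index maps a movable bucket's dirty fraction
+ * onto a 31 bit fixed point scale - a worked example with illustrative
+ * numbers, for a 512 sector bucket holding 128 dirty sectors:
+ *
+ *    128 * (1ULL << 31) / 512 = 1 << 29
+ *
+ * i.e. a quarter of the way up the scale; completely full (or non-movable)
+ * buckets return 0.
+ */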
+
+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
+{
+ return ((u64) alloc_gc_gen(a) >> 4) << 56;
+}
+
+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
+{
+ pos.offset |= alloc_freespace_genbits(a);
+ return pos;
+}
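+
+/*
+ * Freespace btree keys encode the bucket in the low 56 bits of the key's
+ * offset and fold (alloc_gc_gen() >> 4) into the top 8 bits, so stale entries
+ * can be detected.  A sketch of the round trip (assuming bucket offsets stay
+ * below 1ULL << 56):
+ *
+ *    struct bpos p = alloc_freespace_pos(POS(dev, bucket), *a);
+ *    u64 b       = p.offset & ~(~0ULL << 56);    // == bucket
+ *    u64 genbits = p.offset &  (~0ULL << 56);    // == alloc_freespace_genbits(*a)
+ */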
+
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0) +
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+ (sizeof(struct bch_backpointer) / sizeof(u64));
+
+ BUG_ON(ret > U8_MAX - BKEY_U64s);
+ return ret;
+}
+
+static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
+{
+ set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+
+void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
+
+static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
+{
+ const struct bch_alloc_v4 *ret;
+
+ if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
+ goto slowpath;
+
+ ret = bkey_s_c_to_alloc_v4(k).v;
+ if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
+ goto slowpath;
+
+ return ret;
+slowpath:
+ __bch2_alloc_to_v4(k, convert);
+ return convert;
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
+
+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+
+int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_alloc_v4_swab(struct bkey_s);
+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v1_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+ .min_val_size = 8, \
+})
+
+#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v2_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+ .min_val_size = 8, \
+})
+
+#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v3_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+ .min_val_size = 16, \
+})
+
+#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v4_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .swab = bch2_alloc_v4_swab, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+ .min_val_size = 48, \
+})
+
+int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
+ .key_invalid = bch2_bucket_gens_invalid, \
+ .val_to_text = bch2_bucket_gens_to_text, \
+})
+
+int bch2_bucket_gens_init(struct bch_fs *);
+
+static inline bool bkey_is_alloc(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_alloc ||
+ k->type == KEY_TYPE_alloc_v2 ||
+ k->type == KEY_TYPE_alloc_v3;
+}
+
+int bch2_alloc_read(struct bch_fs *);
+
+int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_check_alloc_info(struct bch_fs *);
+int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_do_discards(struct bch_fs *);
+
+static inline u64 should_invalidate_buckets(struct bch_dev *ca,
+ struct bch_dev_usage u)
+{
+ u64 want_free = ca->mi.nbuckets >> 7;
+ u64 free = max_t(s64, 0,
+ u.d[BCH_DATA_free].buckets
+ + u.d[BCH_DATA_need_discard].buckets
+ - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
+
+ return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
+}
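+
+/*
+ * i.e. aim to keep roughly 1/128th of the device's buckets free or awaiting
+ * discard, invalidating at most that many cached buckets per pass.  A worked
+ * example with illustrative numbers - nbuckets = 1 << 20, 4096 free buckets,
+ * nothing needing discard, no reserve:
+ *
+ *    want_free = 1048576 >> 7 = 8192
+ *    return clamp(8192 - 4096, 0, nr cached buckets)
+ */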
+
+void bch2_do_invalidates(struct bch_fs *);
+
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v +
+ (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0));
+}
+
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+}
+
+int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
+int bch2_fs_freespace_init(struct bch_fs *);
+
+void bch2_recalc_capacity(struct bch_fs *);
+u64 bch2_min_rw_member_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
+void bch2_fs_allocator_background_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
new file mode 100644
index 000000000000..0e6157982607
--- /dev/null
+++ b/fs/bcachefs/alloc_foreground.c
@@ -0,0 +1,1638 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2012 Google, Inc.
+ *
+ * Foreground allocator code: allocate buckets from freelist, and allocate in
+ * sector granularity from writepoints.
+ *
+ * bch2_bucket_alloc() allocates a single bucket from a specific device.
+ *
+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices
+ * in a given filesystem.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_write.h"
+#include "journal.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "trace.h"
+
+#include <linux/math64.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ if (!mutex_trylock(lock)) {
+ bch2_trans_unlock(trans);
+ mutex_lock(lock);
+ }
+}
+
+const char * const bch2_watermarks[] = {
+#define x(t) #t,
+ BCH_WATERMARKS()
+#undef x
+ NULL
+};
+
+/*
+ * An open bucket represents a bucket that's currently being allocated from. Open buckets
+ * serve two purposes:
+ *
+ * - They track buckets that have been partially allocated, allowing for
+ * sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ * - They provide a reference to the buckets they own that mark and sweep GC
+ * can find, until the new allocation has a pointer to it inserted into the
+ * btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
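+
+/*
+ * A minimal sketch of the usage pattern described above (hypothetical caller,
+ * error handling omitted; the put helper is assumed to be the usual
+ * bch2_open_bucket_put() from alloc_foreground.h):
+ *
+ *    ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
+ *    // write to the bucket, insert the btree key pointing at it...
+ *    bch2_open_bucket_put(c, ob);    // only after the index update
+ */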
+
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ ca->alloc_cursor = 0;
+ rcu_read_unlock();
+}
+
+static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ ob->hash = *slot;
+ *slot = idx;
+}
+
+static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ while (*slot != idx) {
+ BUG_ON(!*slot);
+ slot = &c->open_buckets[*slot].hash;
+ }
+
+ *slot = ob->hash;
+ ob->hash = 0;
+}
+
+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ if (ob->ec) {
+ ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
+ return;
+ }
+
+ percpu_down_read(&c->mark_lock);
+ spin_lock(&ob->lock);
+
+ ob->valid = false;
+ ob->data_type = 0;
+
+ spin_unlock(&ob->lock);
+ percpu_up_read(&c->mark_lock);
+
+ spin_lock(&c->freelist_lock);
+ bch2_open_bucket_hash_remove(c, ob);
+
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
+
+ c->open_buckets_nr_free++;
+ ca->nr_open_buckets--;
+ spin_unlock(&c->freelist_lock);
+
+ closure_wake_up(&c->open_buckets_wait);
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *c,
+ struct open_buckets *obs,
+ unsigned dev)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, obs, ob, i)
+ if (ob->dev == dev && ob->ec)
+ bch2_ec_bucket_cancel(c, ob);
+}
+
+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
+
+ ob = c->open_buckets + c->open_buckets_freelist;
+ c->open_buckets_freelist = ob->freelist;
+ atomic_set(&ob->pin, 1);
+ ob->data_type = 0;
+
+ c->open_buckets_nr_free--;
+ return ob;
+}
+
+static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
+{
+ BUG_ON(c->open_buckets_partial_nr >=
+ ARRAY_SIZE(c->open_buckets_partial));
+
+ spin_lock(&c->freelist_lock);
+ ob->on_partial_list = true;
+ c->open_buckets_partial[c->open_buckets_partial_nr++] =
+ ob - c->open_buckets;
+ spin_unlock(&c->freelist_lock);
+
+ closure_wake_up(&c->open_buckets_wait);
+ closure_wake_up(&c->freelist_wait);
+}
+
+/* _only_ for allocating the journal on a new device: */
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
+{
+ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
+ u64 b = ca->new_fs_bucket_idx++;
+
+ if (!is_superblock_bucket(ca, b) &&
+ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
+ return b;
+ }
+
+ return -1;
+}
+
+static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
+{
+ switch (watermark) {
+ case BCH_WATERMARK_reclaim:
+ return 0;
+ case BCH_WATERMARK_btree:
+ case BCH_WATERMARK_btree_copygc:
+ return OPEN_BUCKETS_COUNT / 4;
+ case BCH_WATERMARK_copygc:
+ return OPEN_BUCKETS_COUNT / 3;
+ default:
+ return OPEN_BUCKETS_COUNT / 2;
+ }
+}
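+
+/*
+ * e.g. assuming OPEN_BUCKETS_COUNT is 1024 (illustrative - the real constant
+ * is defined elsewhere), __try_alloc_bucket() below fails with
+ * -BCH_ERR_open_buckets_empty for a default watermark allocation once 512 or
+ * fewer open buckets remain free, for copygc at 341 or fewer, while
+ * BCH_WATERMARK_reclaim may take the very last one.
+ */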
+
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket,
+ enum bch_watermark watermark,
+ const struct bch_alloc_v4 *a,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ s->skipped_nouse++;
+ return NULL;
+ }
+
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ s->skipped_open++;
+ return NULL;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ s->skipped_need_journal_commit++;
+ return NULL;
+ }
+
+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
+ s->skipped_nocow++;
+ return NULL;
+ }
+
+ spin_lock(&c->freelist_lock);
+
+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
+ if (cl)
+ closure_wait(&c->open_buckets_wait, cl);
+
+ if (!c->blocked_allocate_open_bucket)
+ c->blocked_allocate_open_bucket = local_clock();
+
+ spin_unlock(&c->freelist_lock);
+ return ERR_PTR(-BCH_ERR_open_buckets_empty);
+ }
+
+ /* Recheck under lock: */
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ spin_unlock(&c->freelist_lock);
+ s->skipped_open++;
+ return NULL;
+ }
+
+ ob = bch2_open_bucket_alloc(c);
+
+ spin_lock(&ob->lock);
+
+ ob->valid = true;
+ ob->sectors_free = ca->mi.bucket_size;
+ ob->dev = ca->dev_idx;
+ ob->gen = a->gen;
+ ob->bucket = bucket;
+ spin_unlock(&ob->lock);
+
+ ca->nr_open_buckets++;
+ bch2_open_bucket_hash_add(c, ob);
+
+ if (c->blocked_allocate_open_bucket) {
+ bch2_time_stats_update(
+ &c->times[BCH_TIME_blocked_allocate_open_bucket],
+ c->blocked_allocate_open_bucket);
+ c->blocked_allocate_open_bucket = 0;
+ }
+
+ if (c->blocked_allocate) {
+ bch2_time_stats_update(
+ &c->times[BCH_TIME_blocked_allocate],
+ c->blocked_allocate);
+ c->blocked_allocate = 0;
+ }
+
+ spin_unlock(&c->freelist_lock);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ enum bch_watermark watermark, u64 free_entry,
+ struct bucket_alloc_state *s,
+ struct bkey_s_c freespace_k,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct open_bucket *ob;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ u64 b = free_entry & ~(~0ULL << 56);
+ unsigned genbits = free_entry >> 56;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+ prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+ " freespace key ",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_alloc, POS(ca->dev_idx, b),
+ BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ if (a->data_type != BCH_DATA_free) {
+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+ ob = NULL;
+ goto err;
+ }
+
+ prt_printf(&buf, "non free bucket in freespace btree\n"
+ " freespace key ");
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
+ prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+ " freespace key ",
+ genbits, alloc_freespace_genbits(*a) >> 56);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ struct bch_backpointer bp;
+ struct bpos bp_pos = POS_MIN;
+
+ ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
+ &bp_pos, &bp,
+ BTREE_ITER_NOPRESERVE);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ if (!bkey_eq(bp_pos, POS_MAX)) {
+ /*
+ * Bucket may have data in it - we don't call
+ * bch2_trans_inconsistent() because fsck hasn't
+ * finished yet
+ */
+ ob = NULL;
+ goto err;
+ }
+ }
+
+ ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
+ if (!ob)
+ iter.path->preserve = false;
+err:
+ if (iter.trans && iter.path)
+ set_btree_iter_dontneed(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ob;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct btree_iter iter, citer;
+ struct bkey_s_c k, ck;
+ struct open_bucket *ob = NULL;
+ u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+ u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
+ int ret;
+
+ /*
+ * Scan with an uncached iterator to avoid polluting the key cache. An
+ * uncached iter will return a cached key if one exists, but if not
+ * there is no other underlying protection for the associated key cache
+ * slot. To avoid racing bucket allocations, look up the cached key slot
+ * of any likely allocation candidate before attempting to proceed with
+ * the allocation. This provides proper exclusion on the associated
+ * bucket.
+ */
+again:
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
+ BTREE_ITER_SLOTS, k, ret) {
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+
+ if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
+ break;
+
+ if (ca->new_fs_bucket_idx &&
+ is_superblock_bucket(ca, k.k->p.offset))
+ continue;
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ continue;
+
+ /* now check the cached key to serialize concurrent allocs of the bucket */
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+ ret = bkey_err(ck);
+ if (ret)
+ break;
+
+ a = bch2_alloc_to_v4(ck, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ goto next;
+
+ s->buckets_seen++;
+
+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
+ citer.path->preserve = false;
+ bch2_trans_iter_exit(trans, &citer);
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ alloc_cursor = iter.pos.offset;
+ ca->alloc_cursor = alloc_cursor;
+
+ if (!ob && ret)
+ ob = ERR_PTR(ret);
+
+ if (!ob && alloc_start > first_bucket) {
+ alloc_cursor = alloc_start = first_bucket;
+ goto again;
+ }
+
+ return ob;
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
+ int ret;
+
+ BUG_ON(ca->new_fs_bucket_idx);
+again:
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
+ if (k.k->p.inode != ca->dev_idx)
+ break;
+
+ for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
+ alloc_cursor < k.k->p.offset;
+ alloc_cursor++) {
+ ret = btree_trans_too_many_iters(trans);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ break;
+ }
+
+ s->buckets_seen++;
+
+ ob = try_alloc_bucket(trans, ca, watermark,
+ alloc_cursor, s, k, cl);
+ if (ob) {
+ iter.path->preserve = false;
+ break;
+ }
+ }
+
+ if (ob || ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ ca->alloc_cursor = alloc_cursor;
+
+ if (!ob && ret)
+ ob = ERR_PTR(ret);
+
+ if (!ob && alloc_start > ca->mi.first_bucket) {
+ alloc_cursor = alloc_start = ca->mi.first_bucket;
+ goto again;
+ }
+
+ return ob;
+}
+
+/**
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans: transaction object
+ * @ca: device to allocate from
+ * @watermark: how important is this allocation?
+ * @cl: if not NULL, closure to be used to wait if buckets not available
+ * @usage: also used to return the current device usage
+ *
+ * Returns: an open_bucket on success, or an ERR_PTR() on failure.
+ */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct closure *cl,
+ struct bch_dev_usage *usage)
+{
+ struct bch_fs *c = trans->c;
+ struct open_bucket *ob = NULL;
+ bool freespace = READ_ONCE(ca->mi.freespace_initialized);
+ u64 avail;
+ struct bucket_alloc_state s = { 0 };
+ bool waiting = false;
+again:
+ bch2_dev_usage_read_fast(ca, usage);
+ avail = dev_buckets_free(ca, *usage, watermark);
+
+ if (usage->d[BCH_DATA_need_discard].buckets > avail)
+ bch2_do_discards(c);
+
+ if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+ bch2_do_gc_gens(c);
+
+ if (should_invalidate_buckets(ca, *usage))
+ bch2_do_invalidates(c);
+
+ if (!avail) {
+ if (cl && !waiting) {
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ goto again;
+ }
+
+ if (!c->blocked_allocate)
+ c->blocked_allocate = local_clock();
+
+ ob = ERR_PTR(-BCH_ERR_freelist_empty);
+ goto err;
+ }
+
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+alloc:
+ ob = likely(freespace)
+ ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
+ : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
+
+ if (s.skipped_need_journal_commit * 2 > avail)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+ freespace = false;
+ goto alloc;
+ }
+err:
+ if (!ob)
+ ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+
+ if (!IS_ERR(ob))
+ trace_and_count(c, bucket_alloc, ca,
+ bch2_watermarks[watermark],
+ ob->bucket,
+ usage->d[BCH_DATA_free].buckets,
+ avail,
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+ &s,
+ cl == NULL,
+ "");
+ else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
+ trace_and_count(c, bucket_alloc_fail, ca,
+ bch2_watermarks[watermark],
+ 0,
+ usage->d[BCH_DATA_free].buckets,
+ avail,
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+ &s,
+ cl == NULL,
+ bch2_err_str(PTR_ERR(ob)));
+
+ return ob;
+}
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct closure *cl)
+{
+ struct bch_dev_usage usage;
+ struct open_bucket *ob;
+
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
+ cl, &usage)));
+ return ob;
+}
+
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
+ unsigned l, unsigned r)
+{
+ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
+ (stripe->next_alloc[l] < stripe->next_alloc[r]));
+}
+
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs)
+{
+ struct dev_alloc_list ret = { .nr = 0 };
+ unsigned i;
+
+ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
+ ret.devs[ret.nr++] = i;
+
+ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
+ return ret;
+}
+
+static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
+ struct dev_stripe_state *stripe,
+ struct bch_dev_usage *usage)
+{
+ u64 *v = stripe->next_alloc + ca->dev_idx;
+ u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+ u64 free_space_inv = free_space
+ ? div64_u64(1ULL << 48, free_space)
+ : 1ULL << 48;
+ u64 scale = *v / 4;
+
+ if (*v + free_space_inv >= *v)
+ *v += free_space_inv;
+ else
+ *v = U64_MAX;
+
+ for (v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+ *v = *v < scale ? 0 : *v - scale;
+}
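+
+/*
+ * A sketch of how the stripe weights behave: devices with more free space get
+ * a smaller increment, so they keep sorting earlier in bch2_dev_alloc_list()
+ * and receive proportionally more allocations.  With illustrative numbers:
+ *
+ *    free_space = 1 << 20 buckets: next_alloc += (1ULL << 48) >> 20 = 1 << 28
+ *    free_space = 1 << 10 buckets: next_alloc += (1ULL << 48) >> 10 = 1 << 38
+ *
+ * The loop at the end then subtracts a quarter of this device's previous
+ * counter from every device (flooring at zero), keeping the counters bounded.
+ */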
+
+void bch2_dev_stripe_increment(struct bch_dev *ca,
+ struct dev_stripe_state *stripe)
+{
+ struct bch_dev_usage usage;
+
+ bch2_dev_usage_read_fast(ca, &usage);
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+}
+
+static int add_new_bucket(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ unsigned flags,
+ struct open_bucket *ob)
+{
+ unsigned durability =
+ bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+ BUG_ON(*nr_effective >= nr_replicas);
+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
+
+ __clear_bit(ob->dev, devs_may_alloc->d);
+ *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ ? durability : 1;
+ *have_cache |= !durability;
+
+ ob_push(c, ptrs, ob);
+
+ if (*nr_effective >= nr_replicas)
+ return 1;
+ if (ob->ec)
+ return 1;
+ return 0;
+}
+
+int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ unsigned flags,
+ enum bch_data_type data_type,
+ enum bch_watermark watermark,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct dev_alloc_list devs_sorted =
+ bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ unsigned dev;
+ struct bch_dev *ca;
+ int ret = -BCH_ERR_insufficient_devices;
+ unsigned i;
+
+ BUG_ON(*nr_effective >= nr_replicas);
+
+ for (i = 0; i < devs_sorted.nr; i++) {
+ struct bch_dev_usage usage;
+ struct open_bucket *ob;
+
+ dev = devs_sorted.devs[i];
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
+
+ if (!ca->mi.durability && *have_cache) {
+ percpu_ref_put(&ca->ref);
+ continue;
+ }
+
+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
+ if (!IS_ERR(ob))
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+ percpu_ref_put(&ca->ref);
+
+ if (IS_ERR(ob)) {
+ ret = PTR_ERR(ob);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
+ break;
+ continue;
+ }
+
+ ob->data_type = data_type;
+
+ if (add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob)) {
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Allocate from stripes: */
+
+/*
+ * if we can't allocate a new stripe because there are already too many
+ * partially filled stripes, force allocating from an existing stripe even when
+ * it's to a device we don't want:
+ */
+
+static int bucket_alloc_from_stripe(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ u16 target,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ unsigned flags,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct dev_alloc_list devs_sorted;
+ struct ec_stripe_head *h;
+ struct open_bucket *ob;
+ unsigned i, ec_idx;
+ int ret = 0;
+
+ if (nr_replicas < 2)
+ return 0;
+
+ if (ec_open_bucket(c, ptrs))
+ return 0;
+
+ h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+ if (!h)
+ return 0;
+
+ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+
+ for (i = 0; i < devs_sorted.nr; i++)
+ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+ if (!h->s->blocks[ec_idx])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[ec_idx];
+ if (ob->dev == devs_sorted.devs[i] &&
+ !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+ goto got_bucket;
+ }
+ goto out_put_head;
+got_bucket:
+ ob->ec_idx = ec_idx;
+ ob->ec = h->s;
+ ec_stripe_new_get(h->s, STRIPE_REF_io);
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
+out_put_head:
+ bch2_ec_stripe_head_put(c, h);
+ return ret;
+}
+
+/* Sector allocator */
+
+static bool want_bucket(struct bch_fs *c,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ bool *have_cache, bool ec,
+ struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ if (!test_bit(ob->dev, devs_may_alloc->d))
+ return false;
+
+ if (ob->data_type != wp->data_type)
+ return false;
+
+ if (!ca->mi.durability &&
+ (wp->data_type == BCH_DATA_btree || ec || *have_cache))
+ return false;
+
+ if (ec != (ob->ec != NULL))
+ return false;
+
+ return true;
+}
+
+static int bucket_alloc_set_writepoint(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ bool ec, unsigned flags)
+{
+ struct open_buckets ptrs_skip = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+ int ret = 0;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ if (!ret && want_bucket(c, wp, devs_may_alloc,
+ have_cache, ec, ob))
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
+ else
+ ob_push(c, &ptrs_skip, ob);
+ }
+ wp->ptrs = ptrs_skip;
+
+ return ret;
+}
+
+static int bucket_alloc_set_partial(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache, bool ec,
+ enum bch_watermark watermark,
+ unsigned flags)
+{
+ int i, ret = 0;
+
+ if (!c->open_buckets_partial_nr)
+ return 0;
+
+ spin_lock(&c->freelist_lock);
+
+ if (!c->open_buckets_partial_nr)
+ goto unlock;
+
+ for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
+ struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
+
+ if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev_usage usage;
+ u64 avail;
+
+ bch2_dev_usage_read_fast(ca, &usage);
+ avail = dev_buckets_free(ca, usage, watermark);
+ if (!avail)
+ continue;
+
+ array_remove_item(c->open_buckets_partial,
+ c->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
+ if (ret)
+ break;
+ }
+ }
+unlock:
+ spin_unlock(&c->freelist_lock);
+ return ret;
+}
+
+static int __open_bucket_add_buckets(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
+ u16 target,
+ bool erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ unsigned flags,
+ struct closure *_cl)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_devs_mask devs;
+ struct open_bucket *ob;
+ struct closure *cl = NULL;
+ unsigned i;
+ int ret;
+
+ devs = target_rw_devs(c, wp->data_type, target);
+
+ /* Don't allocate from devices we already have pointers to: */
+ for (i = 0; i < devs_have->nr; i++)
+ __clear_bit(devs_have->devs[i], devs.d);
+
+ open_bucket_for_each(c, ptrs, ob, i)
+ __clear_bit(ob->dev, devs.d);
+
+ if (erasure_code && ec_open_bucket(c, ptrs))
+ return 0;
+
+ ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, erasure_code, flags);
+ if (ret)
+ return ret;
+
+ ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, erasure_code, watermark, flags);
+ if (ret)
+ return ret;
+
+ if (erasure_code) {
+ ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
+ target,
+ nr_replicas, nr_effective,
+ have_cache,
+ watermark, flags, _cl);
+ } else {
+retry_blocking:
+ /*
+ * Try nonblocking first, so that if one device is full we'll try from
+ * other devices:
+ */
+ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
+ nr_replicas, nr_effective, have_cache,
+ flags, wp->data_type, watermark, cl);
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
+ !cl && _cl) {
+ cl = _cl;
+ goto retry_blocking;
+ }
+ }
+
+ return ret;
+}
+
+static int open_bucket_add_buckets(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
+ u16 target,
+ unsigned erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ unsigned flags,
+ struct closure *cl)
+{
+ int ret;
+
+ if (erasure_code) {
+ ret = __open_bucket_add_buckets(trans, ptrs, wp,
+ devs_have, target, erasure_code,
+ nr_replicas, nr_effective, have_cache,
+ watermark, flags, cl);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
+ bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ return ret;
+ if (*nr_effective >= nr_replicas)
+ return 0;
+ }
+
+ ret = __open_bucket_add_buckets(trans, ptrs, wp,
+ devs_have, target, false,
+ nr_replicas, nr_effective, have_cache,
+ watermark, flags, cl);
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob: open_bucket to check
+ * @c: filesystem handle
+ * @ca: if set, we're killing buckets for a particular device
+ * @ec: if true, we're shutting down erasure coding and killing all ec
+ * open_buckets
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches:
+ */
+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
+ struct bch_dev *ca, bool ec)
+{
+ if (ec) {
+ return ob->ec != NULL;
+ } else if (ca) {
+ bool drop = ob->dev == ca->dev_idx;
+ struct open_bucket *ob2;
+ unsigned i;
+
+ if (!drop && ob->ec) {
+ unsigned nr_blocks;
+
+ mutex_lock(&ob->ec->lock);
+ nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (!ob->ec->blocks[i])
+ continue;
+
+ ob2 = c->open_buckets + ob->ec->blocks[i];
+ drop |= ob2->dev == ca->dev_idx;
+ }
+ mutex_unlock(&ob->ec->lock);
+ }
+
+ return drop;
+ } else {
+ return true;
+ }
+}
+
+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec, struct write_point *wp)
+{
+ struct open_buckets ptrs = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&wp->lock);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (should_drop_bucket(ob, c, ca, ec))
+ bch2_open_bucket_put(c, ob);
+ else
+ ob_push(c, &ptrs, ob);
+ wp->ptrs = ptrs;
+ mutex_unlock(&wp->lock);
+}
+
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec)
+{
+ unsigned i;
+
+ /* Next, close write points that point to this device... */
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
+
+ bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ bch2_open_buckets_put(c, &a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ spin_lock(&c->freelist_lock);
+ i = 0;
+ while (i < c->open_buckets_partial_nr) {
+ struct open_bucket *ob =
+ c->open_buckets + c->open_buckets_partial[i];
+
+ if (should_drop_bucket(ob, c, ca, ec)) {
+ --c->open_buckets_partial_nr;
+ swap(c->open_buckets_partial[i],
+ c->open_buckets_partial[c->open_buckets_partial_nr]);
+ ob->on_partial_list = false;
+ spin_unlock(&c->freelist_lock);
+ bch2_open_bucket_put(c, ob);
+ spin_lock(&c->freelist_lock);
+ } else {
+ i++;
+ }
+ }
+ spin_unlock(&c->freelist_lock);
+
+ bch2_ec_stop_dev(c, ca);
+}
+
+static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
+ unsigned long write_point)
+{
+ unsigned hash =
+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
+
+ return &c->write_points_hash[hash];
+}
+
+static struct write_point *__writepoint_find(struct hlist_head *head,
+ unsigned long write_point)
+{
+ struct write_point *wp;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(wp, head, node)
+ if (wp->write_point == write_point)
+ goto out;
+ wp = NULL;
+out:
+ rcu_read_unlock();
+ return wp;
+}
+
+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
+{
+ u64 stranded = c->write_points_nr * c->bucket_size_max;
+ u64 free = bch2_fs_usage_read_short(c).free;
+
+ return stranded * factor > free;
+}
+
+static bool try_increase_writepoints(struct bch_fs *c)
+{
+ struct write_point *wp;
+
+ if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
+ too_many_writepoints(c, 32))
+ return false;
+
+ wp = c->write_points + c->write_points_nr++;
+ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
+ return true;
+}
+
+static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp;
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&c->write_points_hash_lock);
+ if (c->write_points_nr < old_nr) {
+ mutex_unlock(&c->write_points_hash_lock);
+ return true;
+ }
+
+ if (c->write_points_nr == 1 ||
+ !too_many_writepoints(c, 8)) {
+ mutex_unlock(&c->write_points_hash_lock);
+ return false;
+ }
+
+ wp = c->write_points + --c->write_points_nr;
+
+ hlist_del_rcu(&wp->node);
+ mutex_unlock(&c->write_points_hash_lock);
+
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ open_bucket_free_unused(c, ob);
+ wp->ptrs.nr = 0;
+ mutex_unlock(&wp->lock);
+ return true;
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
+ unsigned long write_point)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp, *oldest;
+ struct hlist_head *head;
+
+ if (!(write_point & 1UL)) {
+ wp = (struct write_point *) write_point;
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ return wp;
+ }
+
+ head = writepoint_hash(c, write_point);
+restart_find:
+ wp = __writepoint_find(head, write_point);
+ if (wp) {
+lock_wp:
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ if (wp->write_point == write_point)
+ goto out;
+ mutex_unlock(&wp->lock);
+ goto restart_find;
+ }
+restart_find_oldest:
+ oldest = NULL;
+ for (wp = c->write_points;
+ wp < c->write_points + c->write_points_nr; wp++)
+ if (!oldest || time_before64(wp->last_used, oldest->last_used))
+ oldest = wp;
+
+ bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
+ bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
+ if (oldest >= c->write_points + c->write_points_nr ||
+ try_increase_writepoints(c)) {
+ mutex_unlock(&c->write_points_hash_lock);
+ mutex_unlock(&oldest->lock);
+ goto restart_find_oldest;
+ }
+
+ wp = __writepoint_find(head, write_point);
+ if (wp && wp != oldest) {
+ mutex_unlock(&c->write_points_hash_lock);
+ mutex_unlock(&oldest->lock);
+ goto lock_wp;
+ }
+
+ wp = oldest;
+ hlist_del_rcu(&wp->node);
+ wp->write_point = write_point;
+ hlist_add_head_rcu(&wp->node, head);
+ mutex_unlock(&c->write_points_hash_lock);
+out:
+ wp->last_used = local_clock();
+ return wp;
+}
+
+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct open_buckets *ptrs_no_use,
+ unsigned extra_replicas)
+{
+ struct open_buckets ptrs2 = { 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, ptrs, ob, i) {
+ unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+ if (d && d <= extra_replicas) {
+ extra_replicas -= d;
+ ob_push(c, ptrs_no_use, ob);
+ } else {
+ ob_push(c, &ptrs2, ob);
+ }
+ }
+
+ *ptrs = ptrs2;
+}
+
+/*
+ * Get us an open_bucket we can allocate from, return with it locked:
+ */
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
+ unsigned target,
+ unsigned erasure_code,
+ struct write_point_specifier write_point,
+ struct bch_devs_list *devs_have,
+ unsigned nr_replicas,
+ unsigned nr_replicas_required,
+ enum bch_watermark watermark,
+ unsigned flags,
+ struct closure *cl,
+ struct write_point **wp_ret)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp;
+ struct open_bucket *ob;
+ struct open_buckets ptrs;
+ unsigned nr_effective, write_points_nr;
+ bool have_cache;
+ int ret;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
+ erasure_code = false;
+
+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
+
+ BUG_ON(!nr_replicas || !nr_replicas_required);
+retry:
+ ptrs.nr = 0;
+ nr_effective = 0;
+ write_points_nr = c->write_points_nr;
+ have_cache = false;
+
+ *wp_ret = wp = writepoint_find(trans, write_point.v);
+
+ /* metadata may not allocate on cache devices: */
+ if (wp->data_type != BCH_DATA_user)
+ have_cache = true;
+
+ if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, NULL);
+ if (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto alloc_done;
+
+ /* Don't retry from all devices if we're out of open buckets: */
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
+ int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ if (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ goto alloc_done;
+ }
+
+ /*
+ * Only try to allocate cache (durability = 0 devices) from the
+ * specified target:
+ */
+ have_cache = true;
+
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ 0, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ } else {
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ }
+alloc_done:
+ BUG_ON(!ret && nr_effective < nr_replicas);
+
+ if (erasure_code && !ec_open_bucket(c, &ptrs))
+ pr_debug("failed to get ec bucket: ret %u", ret);
+
+ if (ret == -BCH_ERR_insufficient_devices &&
+ nr_effective >= nr_replicas_required)
+ ret = 0;
+
+ if (ret)
+ goto err;
+
+ if (nr_effective > nr_replicas)
+ deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
+ /* Free buckets we didn't use: */
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ open_bucket_free_unused(c, ob);
+
+ wp->ptrs = ptrs;
+
+ wp->sectors_free = UINT_MAX;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+
+ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
+
+ return 0;
+err:
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (ptrs.nr < ARRAY_SIZE(ptrs.v))
+ ob_push(c, &ptrs, ob);
+ else
+ open_bucket_free_unused(c, ob);
+ wp->ptrs = ptrs;
+
+ mutex_unlock(&wp->lock);
+
+ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
+ try_decrease_writepoints(trans, write_points_nr))
+ goto retry;
+
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty))
+ return cl
+ ? -BCH_ERR_bucket_alloc_blocked
+ : -BCH_ERR_ENOSPC_bucket_alloc;
+
+ return ret;
+}
+
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ return (struct bch_extent_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .gen = ob->gen,
+ .dev = ob->dev,
+ .offset = bucket_to_sector(ca, ob->bucket) +
+ ca->mi.bucket_size -
+ ob->sectors_free,
+ };
+}
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
+{
+ bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
+}
+
+/*
+ * We're done allocating from @wp: unlock it, and release any open_buckets that
+ * are now full
+ */
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
+{
+ bch2_alloc_sectors_done_inlined(c, wp);
+}
+
+static inline void writepoint_init(struct write_point *wp,
+ enum bch_data_type type)
+{
+ mutex_init(&wp->lock);
+ wp->data_type = type;
+
+ INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
+ INIT_LIST_HEAD(&wp->writes);
+ spin_lock_init(&wp->writes_lock);
+}
+
+void bch2_fs_allocator_foreground_init(struct bch_fs *c)
+{
+ struct open_bucket *ob;
+ struct write_point *wp;
+
+ mutex_init(&c->write_points_hash_lock);
+ c->write_points_nr = ARRAY_SIZE(c->write_points);
+
+ /* open bucket 0 is a sentinel NULL: */
+ spin_lock_init(&c->open_buckets[0].lock);
+
+ for (ob = c->open_buckets + 1;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+ spin_lock_init(&ob->lock);
+ c->open_buckets_nr_free++;
+
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
+ }
+
+ writepoint_init(&c->btree_write_point, BCH_DATA_btree);
+ writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
+ writepoint_init(&c->copygc_write_point, BCH_DATA_user);
+
+ for (wp = c->write_points;
+ wp < c->write_points + c->write_points_nr; wp++) {
+ writepoint_init(wp, BCH_DATA_user);
+
+ wp->last_used = local_clock();
+ wp->write_point = (unsigned long) wp;
+ hlist_add_head_rcu(&wp->node,
+ writepoint_hash(c, wp->write_point));
+ }
+}
+
+static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ unsigned data_type = ob->data_type;
+ barrier(); /* READ_ONCE() doesn't work on bitfields */
+
+ prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+ ob - c->open_buckets,
+ atomic_read(&ob->pin),
+ data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+ ob->dev, ob->bucket, ob->gen,
+ ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
+ if (ob->ec)
+ prt_printf(out, " ec idx %llu", ob->ec->idx);
+ if (ob->on_partial_list)
+ prt_str(out, " partial");
+ prt_newline(out);
+}
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ out->atomic++;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list)
+ bch2_open_bucket_to_text(out, c, ob);
+ spin_unlock(&ob->lock);
+ }
+
+ --out->atomic;
+}
+
+void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ unsigned i;
+
+ out->atomic++;
+ spin_lock(&c->freelist_lock);
+
+ for (i = 0; i < c->open_buckets_partial_nr; i++)
+ bch2_open_bucket_to_text(out, c,
+ c->open_buckets + c->open_buckets_partial[i]);
+
+ spin_unlock(&c->freelist_lock);
+ --out->atomic;
+}
+
+static const char * const bch2_write_point_states[] = {
+#define x(n) #n,
+ WRITE_POINT_STATES()
+#undef x
+ NULL
+};
+
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+ struct write_point *wp)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ prt_printf(out, "%lu: ", wp->write_point);
+ prt_human_readable_u64(out, wp->sectors_allocated);
+
+ prt_printf(out, " last wrote: ");
+ bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+ prt_printf(out, " %s: ", bch2_write_point_states[i]);
+ bch2_pr_time_units(out, wp->time[i]);
+ }
+
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ bch2_open_bucket_to_text(out, c, ob);
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct write_point *wp;
+
+ prt_str(out, "Foreground write points\n");
+ for (wp = c->write_points;
+ wp < c->write_points + ARRAY_SIZE(c->write_points);
+ wp++)
+ bch2_write_point_to_text(out, c, wp);
+
+ prt_str(out, "Copygc write point\n");
+ bch2_write_point_to_text(out, c, &c->copygc_write_point);
+
+ prt_str(out, "Rebalance write point\n");
+ bch2_write_point_to_text(out, c, &c->rebalance_write_point);
+
+ prt_str(out, "Btree write point\n");
+ bch2_write_point_to_text(out, c, &c->btree_write_point);
+}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
new file mode 100644
index 000000000000..7aaeec44c746
--- /dev/null
+++ b/fs/bcachefs/alloc_foreground.h
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
+#define _BCACHEFS_ALLOC_FOREGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "extents.h"
+#include "sb-members.h"
+
+#include <linux/hash.h>
+
+struct bkey;
+struct bch_dev;
+struct bch_fs;
+struct bch_devs_list;
+
+extern const char * const bch2_watermarks[];
+
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
+struct dev_alloc_list {
+ unsigned nr;
+ u8 devs[BCH_SB_MEMBERS_MAX];
+};
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
+ struct dev_stripe_state *,
+ struct bch_devs_mask *);
+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
+
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
+ enum bch_watermark, struct closure *);
+
+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
+ struct open_bucket *ob)
+{
+ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
+
+ obs->v[obs->nr++] = ob - c->open_buckets;
+}
+
+#define open_bucket_for_each(_c, _obs, _ob, _i) \
+ for ((_i) = 0; \
+ (_i) < (_obs)->nr && \
+ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
+ (_i)++)
+
+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
+ struct open_buckets *obs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, obs, ob, i)
+ if (ob->ec)
+ return ob;
+
+ return NULL;
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *,
+ struct open_buckets *, unsigned);
+
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ if (atomic_dec_and_test(&ob->pin))
+ __bch2_open_bucket_put(c, ob);
+}
+
+static inline void bch2_open_buckets_put(struct bch_fs *c,
+ struct open_buckets *ptrs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, ptrs, ob, i)
+ bch2_open_bucket_put(c, ob);
+ ptrs->nr = 0;
+}
+
+static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
+{
+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
+ wp->ptrs = keep;
+
+ mutex_unlock(&wp->lock);
+
+ bch2_open_buckets_put(c, &ptrs);
+}
+
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+ struct write_point *wp,
+ struct open_buckets *ptrs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ ob->data_type = wp->data_type;
+ atomic_inc(&ob->pin);
+ ob_push(c, ptrs, ob);
+ }
+}
+
+static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
+ unsigned dev, u64 bucket)
+{
+ return c->open_buckets_hash +
+ (jhash_3words(dev, bucket, bucket >> 32, 0) &
+ (OPEN_BUCKETS_COUNT - 1));
+}
+
+static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
+
+ while (slot) {
+ struct open_bucket *ob = &c->open_buckets[slot];
+
+ if (ob->dev == dev && ob->bucket == bucket)
+ return true;
+
+ slot = ob->hash;
+ }
+
+ return false;
+}
+
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ bool ret;
+
+ if (bch2_bucket_is_open(c, dev, bucket))
+ return true;
+
+ spin_lock(&c->freelist_lock);
+ ret = bch2_bucket_is_open(c, dev, bucket);
+ spin_unlock(&c->freelist_lock);
+
+ return ret;
+}
+
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
+ struct dev_stripe_state *, struct bch_devs_mask *,
+ unsigned, unsigned *, bool *, unsigned,
+ enum bch_data_type, enum bch_watermark,
+ struct closure *);
+
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+ unsigned, unsigned,
+ struct write_point_specifier,
+ struct bch_devs_list *,
+ unsigned, unsigned,
+ enum bch_watermark,
+ unsigned,
+ struct closure *,
+ struct write_point **);
+
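+/*
+ * Rough usage sketch of the foreground allocation path (illustrative only:
+ * error handling, transaction restarts and real key construction are elided,
+ * and the variable names below are placeholders):
+ *
+ *	struct write_point *wp;
+ *	int ret = bch2_alloc_sectors_start_trans(trans, target, false,
+ *			writepoint_hashed((unsigned long) current),
+ *			&devs_have, nr_replicas, nr_replicas,
+ *			BCH_WATERMARK_normal, 0, NULL, &wp);
+ *
+ *	if (!ret) {
+ *		unsigned sectors = min(wp->sectors_free, wanted_sectors);
+ *
+ *		bch2_alloc_sectors_append_ptrs(c, wp, &insert->k_i, sectors, false);
+ *		bch2_alloc_sectors_done(c, wp);		(this unlocks wp)
+ *	}
+ */
+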
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+static inline void
+bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ BUG_ON(sectors > wp->sectors_free);
+ wp->sectors_free -= sectors;
+ wp->sectors_allocated += sectors;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
+
+ ptr.cached = cached ||
+ (!ca->mi.durability &&
+ wp->data_type == BCH_DATA_user);
+
+ bch2_bkey_append_ptr(k, ptr);
+
+ BUG_ON(sectors > ob->sectors_free);
+ ob->sectors_free -= sectors;
+ }
+}
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+ struct bkey_i *, unsigned, bool);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
+
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+ return (struct write_point_specifier) { .v = v | 1 };
+}
+
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+ return (struct write_point_specifier) { .v = (unsigned long) wp };
+}
+
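+/*
+ * Note: writepoint_find() tells these two kinds of specifier apart by bit 0:
+ * writepoint_hashed() sets it, while writepoint_ptr() passes a (suitably
+ * aligned) write_point pointer through with bit 0 clear.
+ */
+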
+void bch2_fs_allocator_foreground_init(struct bch_fs *);
+
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
new file mode 100644
index 000000000000..b91b7a461056
--- /dev/null
+++ b/fs/bcachefs/alloc_types.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_TYPES_H
+#define _BCACHEFS_ALLOC_TYPES_H
+
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#include "clock_types.h"
+#include "fifo.h"
+
+struct bucket_alloc_state {
+ u64 buckets_seen;
+ u64 skipped_open;
+ u64 skipped_need_journal_commit;
+ u64 skipped_nocow;
+ u64 skipped_nouse;
+};
+
+#define BCH_WATERMARKS() \
+ x(stripe) \
+ x(normal) \
+ x(copygc) \
+ x(btree) \
+ x(btree_copygc) \
+ x(reclaim)
+
+enum bch_watermark {
+#define x(name) BCH_WATERMARK_##name,
+ BCH_WATERMARKS()
+#undef x
+ BCH_WATERMARK_NR,
+};
+
+#define BCH_WATERMARK_BITS 3
+#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS)
+
+#define OPEN_BUCKETS_COUNT 1024
+
+#define WRITE_POINT_HASH_NR 32
+#define WRITE_POINT_MAX 32
+
+/*
+ * 0 is never a valid open_bucket_idx_t:
+ */
+typedef u16 open_bucket_idx_t;
+
+struct open_bucket {
+ spinlock_t lock;
+ atomic_t pin;
+ open_bucket_idx_t freelist;
+ open_bucket_idx_t hash;
+
+ /*
+ * When an open bucket has an ec_stripe attached, this is the index of
+ * the block in the stripe this open_bucket corresponds to:
+ */
+ u8 ec_idx;
+ enum bch_data_type data_type:6;
+ unsigned valid:1;
+ unsigned on_partial_list:1;
+
+ u8 dev;
+ u8 gen;
+ u32 sectors_free;
+ u64 bucket;
+ struct ec_stripe_new *ec;
+};
+
+#define OPEN_BUCKET_LIST_MAX 15
+
+struct open_buckets {
+ open_bucket_idx_t nr;
+ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX];
+};
+
+struct dev_stripe_state {
+ u64 next_alloc[BCH_SB_MEMBERS_MAX];
+};
+
+#define WRITE_POINT_STATES() \
+ x(stopped) \
+ x(waiting_io) \
+ x(waiting_work) \
+ x(running)
+
+enum write_point_state {
+#define x(n) WRITE_POINT_##n,
+ WRITE_POINT_STATES()
+#undef x
+ WRITE_POINT_STATE_NR
+};
+
+struct write_point {
+ struct {
+ struct hlist_node node;
+ struct mutex lock;
+ u64 last_used;
+ unsigned long write_point;
+ enum bch_data_type data_type;
+
+ /* calculated based on how many pointers we're actually going to use: */
+ unsigned sectors_free;
+
+ struct open_buckets ptrs;
+ struct dev_stripe_state stripe;
+
+ u64 sectors_allocated;
+ } __aligned(SMP_CACHE_BYTES);
+
+ struct {
+ struct work_struct index_update_work;
+
+ struct list_head writes;
+ spinlock_t writes_lock;
+
+ enum write_point_state state;
+ u64 last_state_change;
+ u64 time[WRITE_POINT_STATE_NR];
+ } __aligned(SMP_CACHE_BYTES);
+};
+
+struct write_point_specifier {
+ unsigned long v;
+};
+
+#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
new file mode 100644
index 000000000000..23c0834a97a4
--- /dev/null
+++ b/fs/bcachefs/backpointers.c
@@ -0,0 +1,860 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+
+#include <linux/mm.h>
+
+static bool extent_matches_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct bpos bucket,
+ struct bch_backpointer bp)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket2;
+ struct bch_backpointer bp2;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
+ &bucket2, &bp2);
+ if (bpos_eq(bucket, bucket2) &&
+ !memcmp(&bp, &bp2, sizeof(bp)))
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+ struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+ int ret = 0;
+
+ bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+ c, err,
+ backpointer_pos_wrong,
+ "backpointer at wrong pos");
+fsck_err:
+ return ret;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+{
+ prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
+ bch2_btree_id_str(bp->btree_id),
+ bp->level,
+ (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ bp->bucket_len);
+ bch2_bpos_to_text(out, bp->pos);
+}
+
+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ prt_str(out, "bucket=");
+ bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+ prt_str(out, " ");
+
+ bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)
+{
+ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+
+ bp.v->bucket_offset = swab40(bp.v->bucket_offset);
+ bp.v->bucket_len = swab32(bp.v->bucket_len);
+ bch2_bpos_swab(&bp.v->pos);
+}
+
+static noinline int backpointer_mod_err(struct btree_trans *trans,
+ struct bch_backpointer bp,
+ struct bkey_s_c bp_k,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ if (insert) {
+ prt_printf(&buf, "existing backpointer found when inserting ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "found ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ prt_printf(&buf, "backpointer not found when deleting");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "searching for ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "got ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ bch2_inconsistent_error(c);
+ return -EIO;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+ struct bkey_i_backpointer *bp_k,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct btree_iter bp_iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+ bp_k->k.p,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (insert
+ ? k.k->type
+ : (k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
+ ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ return ret;
+}
+
+/*
+ * Find the next backpointer >= *bp_pos:
+ */
+int bch2_get_next_backpointer(struct btree_trans *trans,
+ struct bpos bucket, int gen,
+ struct bpos *bp_pos,
+ struct bch_backpointer *bp,
+ unsigned iter_flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+ struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (bpos_ge(*bp_pos, bp_end_pos))
+ goto done;
+
+ if (gen >= 0) {
+ k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED|iter_flags);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ if (k.k->type != KEY_TYPE_alloc_v4 ||
+ bkey_s_c_to_alloc_v4(k).v->gen != gen)
+ goto done;
+ }
+
+ *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0));
+
+ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
+ *bp_pos, iter_flags, k, ret) {
+ if (bpos_ge(k.k->p, bp_end_pos))
+ break;
+
+ *bp_pos = k.k->p;
+ *bp = *bkey_s_c_to_backpointer(k).v;
+ goto out;
+ }
+done:
+ *bp_pos = SPOS_MAX;
+out:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+static void backpointer_not_found(struct btree_trans *trans,
+ struct bpos bp_pos,
+ struct bch_backpointer bp,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+
+ /*
+ * If we're using the btree write buffer, the backpointer we were
+ * looking at may have already been deleted - failure to find what it
+ * pointed to is not an error:
+ */
+ if (likely(!bch2_backpointers_no_use_write_buffer))
+ return;
+
+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
+ bp.level ? "btree node" : "extent");
+ prt_printf(&buf, "bucket: ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_printf(&buf, "\n ");
+
+ prt_printf(&buf, "backpointer pos: ");
+ bch2_bpos_to_text(&buf, bp_pos);
+ prt_printf(&buf, "\n ");
+
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers)
+ bch_err_ratelimited(c, "%s", buf.buf);
+ else
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+
+ printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bp_pos,
+ struct bch_backpointer bp,
+ unsigned iter_flags)
+{
+ if (likely(!bp.level)) {
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct bkey_s_c k;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0, 0,
+ iter_flags);
+ k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ backpointer_not_found(trans, bp_pos, bp, k);
+ return bkey_s_c_null;
+ } else {
+ struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+
+ if (IS_ERR_OR_NULL(b)) {
+ bch2_trans_iter_exit(trans, iter);
+ return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
+ }
+ return bkey_i_to_s_c(&b->key);
+ }
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bp_pos,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct btree *b;
+
+ BUG_ON(!bp.level);
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0,
+ bp.level - 1,
+ 0);
+ b = bch2_btree_iter_peek_node(iter);
+ if (IS_ERR_OR_NULL(b))
+ goto err;
+
+ BUG_ON(b->c.level != bp.level - 1);
+
+ if (extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
+ return b;
+
+ if (btree_node_will_make_reachable(b)) {
+ b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ } else {
+ backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
+ b = NULL;
+ }
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return b;
+}
+
+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bkey_s_c alloc_k;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+ backpointer_to_missing_device,
+ "backpointer for missing device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+
+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
+ bp_pos_to_bucket(c, k.k->p), 0);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto out;
+
+ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+ backpointer_to_missing_alloc,
+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+ alloc_iter.pos.inode, alloc_iter.pos.offset,
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_backpointers, POS_MIN, 0, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ bch2_check_btree_backpointer(trans, &iter, k)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+struct bpos_level {
+ unsigned level;
+ struct bpos pos;
+};
+
+static int check_bp_exists(struct btree_trans *trans,
+ struct bpos bucket,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter bp_iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c bp_k;
+ int ret;
+
+ if (bpos_lt(bucket, bucket_start) ||
+ bpos_gt(bucket, bucket_end))
+ return 0;
+
+ if (!bch2_dev_bucket_exists(c, bucket))
+ goto missing;
+
+ bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, bucket, bp.bucket_offset),
+ 0);
+ ret = bkey_err(bp_k);
+ if (ret)
+ goto err;
+
+ if (bp_k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
+ if (last_flushed->level != bp.level ||
+ !bpos_eq(last_flushed->pos, orig_k.k->p)) {
+ last_flushed->level = bp.level;
+ last_flushed->pos = orig_k.k->p;
+
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+ goto missing;
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ return ret;
+missing:
+ prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
+ bch2_btree_id_str(bp.btree_id), bp.level);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_printf(&buf, "\nbp pos ");
+ bch2_bpos_to_text(&buf, bp_iter.pos);
+
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
+ c->opts.reconstruct_alloc ||
+ fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
+
+ goto out;
+}
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek_all_levels(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k,
+ bucket_start, bucket_end,
+ last_flushed);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_root *r = bch2_btree_id_root(c, btree_id);
+ struct btree_iter iter;
+ struct btree *b;
+ struct bkey_s_c k;
+ struct bkey_ptrs_c ptrs;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ BUG_ON(b != btree_node_root(c, b));
+
+ k = bkey_i_to_s_c(&b->key);
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k,
+ bucket_start, bucket_end,
+ last_flushed);
+ if (ret)
+ goto err;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+ return (struct bbpos) {
+ .btree = bp.btree_id,
+ .pos = bp.pos,
+ };
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ struct sysinfo i;
+ u64 mem_bytes;
+
+ si_meminfo(&i);
+ mem_bytes = i.totalram * i.mem_unit;
+ return div_u64(mem_bytes >> 1, btree_bytes(c));
+}
+
+static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
+ unsigned btree_leaf_mask,
+ unsigned btree_interior_mask,
+ struct bbpos start, struct bbpos *end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+ enum btree_id btree;
+ int ret = 0;
+
+ for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
+ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+
+ if (!((1U << btree) & btree_leaf_mask) &&
+ !((1U << btree) & btree_interior_mask))
+ continue;
+
+ bch2_trans_node_iter_init(trans, &iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, 0);
+ /*
+ * for_each_btree_key_continue() doesn't check the return value
+ * from bch2_btree_iter_advance(), which is needed when
+ * iterating over interior nodes where we'll see keys at
+ * SPOS_MAX:
+ */
+ do {
+ k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
+ ret = bkey_err(k);
+ if (!k.k || ret)
+ break;
+
+ --btree_nodes;
+ if (!btree_nodes) {
+ *end = BBPOS(btree, k.k->p);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+ }
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ *end = BBPOS_MAX;
+ return ret;
+}
+
+static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
+ struct bpos bucket_start,
+ struct bpos bucket_end)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ enum btree_id btree_id;
+ struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
+ int ret = 0;
+
+ for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
+ unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ depth,
+ BTREE_ITER_ALL_LEVELS|
+ BTREE_ITER_PREFETCH);
+
+ do {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_extent_to_backpointers(trans, &iter,
+ bucket_start, bucket_end,
+ &last_flushed));
+ if (ret)
+ break;
+ } while (!bch2_btree_iter_advance(&iter));
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ break;
+
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_btree_root_to_backpointers(trans, btree_id,
+ bucket_start, bucket_end,
+ &last_flushed));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
+ struct bpos bucket)
+{
+ return bch2_dev_exists2(c, bucket.inode)
+ ? bucket_pos_to_bp(c, bucket, 0)
+ : bucket;
+}
+
+static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
+ struct bpos start, struct bpos *end)
+{
+ struct btree_iter alloc_iter;
+ struct btree_iter bp_iter;
+ struct bkey_s_c alloc_k, bp_k;
+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+ bool alloc_end = false, bp_end = false;
+ int ret = 0;
+
+ bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ start, 0, 1, 0);
+ bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
+ while (1) {
+ alloc_k = !alloc_end
+ ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
+ : bkey_s_c_null;
+ bp_k = !bp_end
+ ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
+ : bkey_s_c_null;
+
+ ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
+ if ((!alloc_k.k && !bp_k.k) || ret) {
+ *end = SPOS_MAX;
+ break;
+ }
+
+ --btree_nodes;
+ if (!btree_nodes) {
+ *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
+ break;
+ }
+
+ if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
+ bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
+ if (!bch2_btree_iter_advance(&alloc_iter))
+ alloc_end = true;
+ } else {
+ if (!bch2_btree_iter_advance(&bp_iter))
+ bp_end = true;
+ }
+ }
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bpos start = POS_MIN, end;
+ int ret;
+
+ while (1) {
+ ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+ if (ret)
+ break;
+
+ if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+ bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
+ __func__, btree_nodes_fit_in_ram(c));
+
+ if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_extents_to_backpointers(): ");
+ bch2_bpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
+ if (ret || bpos_eq(end, SPOS_MAX))
+ break;
+
+ start = bpos_successor(end);
+ }
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end,
+ struct bkey_s_c_backpointer bp,
+ struct bpos *last_flushed_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bbpos pos = bp_to_bbpos(*bp.v);
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (bbpos_cmp(pos, start) < 0 ||
+ bbpos_cmp(pos, end) > 0)
+ return 0;
+
+ k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0);
+ ret = bkey_err(k);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ return 0;
+ if (ret)
+ return ret;
+
+ if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
+ *last_flushed_pos = bp.k->p;
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+
+ if (fsck_err_on(!k.k, c,
+ backpointer_to_missing_ptr,
+ "backpointer for missing %s\n %s",
+ bp.v->level ? "btree node" : "extent",
+ (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
+ ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
+ goto out;
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos last_flushed_pos = SPOS_MAX;
+
+ return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_one_backpointer(trans, start, end,
+ bkey_s_c_to_backpointer(k),
+ &last_flushed_pos));
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
+ int ret;
+
+ while (1) {
+ ret = bch2_get_btree_in_memory_pos(trans,
+ (1U << BTREE_ID_extents)|
+ (1U << BTREE_ID_reflink),
+ ~0,
+ start, &end);
+ if (ret)
+ break;
+
+ if (!bbpos_cmp(start, BBPOS_MIN) &&
+ bbpos_cmp(end, BBPOS_MAX))
+ bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
+ __func__, btree_nodes_fit_in_ram(c));
+
+ if (bbpos_cmp(start, BBPOS_MIN) ||
+ bbpos_cmp(end, BBPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_backpointers_to_extents(): ");
+ bch2_bbpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bbpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
+ if (ret || !bbpos_cmp(end, BBPOS_MAX))
+ break;
+
+ start = bbpos_successor(end);
+ }
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
new file mode 100644
index 000000000000..ab866feeaf66
--- /dev/null
+++ b/fs/bcachefs/backpointers.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "super.h"
+
+static inline u64 swab40(u64 x)
+{
+ return (((x & 0x00000000ffULL) << 32)|
+ ((x & 0x000000ff00ULL) << 16)|
+ ((x & 0x0000ff0000ULL) >> 0)|
+ ((x & 0x00ff000000ULL) >> 16)|
+ ((x & 0xff00000000ULL) >> 32));
+}
+
+int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
+ .key_invalid = bch2_backpointer_invalid, \
+ .val_to_text = bch2_backpointer_k_to_text, \
+ .swab = bch2_backpointer_swab, \
+ .min_val_size = 32, \
+})
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
+ struct bpos bp_pos)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+ struct bpos ret;
+
+ ret = POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+
+ EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
+
+ return ret;
+}
+
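+/*
+ * Worked example of the mapping above (illustrative; assumes 512-sector
+ * buckets): bucket (inode=2, offset=10) at bucket_offset=3 maps to
+ * backpointer pos (2, ((10 * 512) << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + 3)
+ * = (2, 5242883), and bp_pos_to_bucket() inverts it: 5242883 >> 10 = 5120
+ * sectors = bucket 10.
+ */
+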
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *,
+ struct bch_backpointer, struct bkey_s_c, bool);
+
+static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+ struct bpos bucket,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_backpointer *bp_k;
+ int ret;
+
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
+ if (unlikely(bch2_backpointers_no_use_write_buffer))
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert);
+
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+}
+
+static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p)
+{
+ return level ? BCH_DATA_btree :
+ p.has_ec ? BCH_DATA_stripe :
+ BCH_DATA_user;
+}
+
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ s64 sectors = level ? btree_sectors(c) : k.k->size;
+ u32 bucket_offset;
+
+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+ *bp = (struct bch_backpointer) {
+ .btree_id = btree_id,
+ .level = level,
+ .data_type = data_type,
+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+ p.crc.offset,
+ .bucket_len = ptr_disk_sectors(sectors, p),
+ .pos = k.k->p,
+ };
+}
+
+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+ struct bpos *, struct bch_backpointer *, unsigned);
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
+ struct bpos, struct bch_backpointer,
+ unsigned);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
+ struct bpos, struct bch_backpointer);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
new file mode 100644
index 000000000000..be2edced5213
--- /dev/null
+++ b/fs/bcachefs/bbpos.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_H
+#define _BCACHEFS_BBPOS_H
+
+#include "bbpos_types.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+
+static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
+{
+ return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
+}
+
+static inline struct bbpos bbpos_successor(struct bbpos pos)
+{
+ if (bpos_cmp(pos.pos, SPOS_MAX)) {
+ pos.pos = bpos_successor(pos.pos);
+ return pos;
+ }
+
+ if (pos.btree != BTREE_ID_NR) {
+ pos.btree++;
+ pos.pos = POS_MIN;
+ return pos;
+ }
+
+ BUG();
+}
+
+static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
+{
+ prt_str(out, bch2_btree_id_str(pos.btree));
+ prt_char(out, ':');
+ bch2_bpos_to_text(out, pos.pos);
+}
+
+#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
new file mode 100644
index 000000000000..5198e94cf3b8
--- /dev/null
+++ b/fs/bcachefs/bbpos_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_TYPES_H
+#define _BCACHEFS_BBPOS_TYPES_H
+
+struct bbpos {
+ enum btree_id btree;
+ struct bpos pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+ return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN BBPOS(0, POS_MIN)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
new file mode 100644
index 000000000000..b62737fdf5ab
--- /dev/null
+++ b/fs/bcachefs/bcachefs.h
@@ -0,0 +1,1164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_H
+#define _BCACHEFS_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
+ * like a md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices, the main abstraction is the cache set.
+ *
+ * Multiple cache devices is intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
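The generation scheme described under BUCKETS/ALLOCATION above reduces to a single comparison: a pointer is live only while the gen embedded in it matches the gen of the bucket it points into, so "deleting" everything in a bucket is just an increment. A tiny stand-alone sketch of that idea; demo_bucket/demo_ptr and the fixed-size bucket array are illustrative stand-ins, not the kernel structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_bucket { uint8_t gen; };			/* per-bucket generation */
struct demo_ptr    { uint8_t gen; unsigned bucket; };	/* 8 bit gen embedded in the pointer */

/* A pointer is valid only while its gen matches the bucket's gen */
static bool demo_ptr_stale(const struct demo_bucket *buckets, struct demo_ptr p)
{
	return buckets[p.bucket].gen != p.gen;
}

int main(void)
{
	struct demo_bucket buckets[4] = { { 0 }, { 0 }, { 0 }, { 0 } };
	struct demo_ptr p = { .gen = 0, .bucket = 2 };

	printf("stale before invalidate: %d\n", demo_ptr_stale(buckets, p));	/* 0 */

	/* "Reusing" bucket 2 is just an increment of its gen (written out lazily) */
	buckets[2].gen++;

	printf("stale after invalidate:  %d\n", demo_ptr_stale(buckets, p));	/* 1 */
	return 0;
}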
+
+#undef pr_fmt
+#ifdef __KERNEL__
+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+#else
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+#endif
+
+#include <linux/backing-dev-defs.h>
+#include <linux/bug.h>
+#include <linux/bio.h>
+#include <linux/closure.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/mutex.h>
+#include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rhashtable.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/seqlock.h>
+#include <linux/shrinker.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/zstd.h>
+
+#include "bcachefs_format.h"
+#include "errcode.h"
+#include "fifo.h"
+#include "nocow_locking_types.h"
+#include "opts.h"
+#include "recovery_types.h"
+#include "sb-errors_types.h"
+#include "seqmutex.h"
+#include "util.h"
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_WRITE_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
+#define dynamic_fault(...) 0
+#endif
+
+#define race_fault(...) dynamic_fault("bcachefs:race")
+
+#define trace_and_count(_c, _name, ...) \
+do { \
+ this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
+ trace_##_name(__VA_ARGS__); \
+} while (0)
+
+#define bch2_fs_init_fault(name) \
+ dynamic_fault("bcachefs:bch_fs_init:" name)
+#define bch2_meta_read_fault(name) \
+ dynamic_fault("bcachefs:meta:read:" name)
+#define bch2_meta_write_fault(name) \
+ dynamic_fault("bcachefs:meta:write:" name)
+
+#ifdef __KERNEL__
+#define BCACHEFS_LOG_PREFIX
+#endif
+
+#ifdef BCACHEFS_LOG_PREFIX
+
+#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
+#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
+ "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
+
+#else
+
+#define bch2_log_msg(_c, fmt) fmt
+#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
+ "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
+
+#endif
+
+#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+
+#define bch_info(c, fmt, ...) \
+ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_notice(c, fmt, ...) \
+ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn(c, fmt, ...) \
+ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn_ratelimited(c, fmt, ...) \
+ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+
+#define bch_err(c, fmt, ...) \
+ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev(ca, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset(ca, _offset, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum(c, _inum, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+#define bch_err_ratelimited(c, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev_ratelimited(ca, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+#define bch_err_fn(_c, _ret) \
+do { \
+ if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
+#define bch_err_msg(_c, _ret, _msg, ...) \
+do { \
+ if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ bch_err(_c, "%s(): error " _msg " %s", __func__, \
+ ##__VA_ARGS__, bch2_err_str(_ret)); \
+} while (0)
+
+#define bch_verbose(c, fmt, ...) \
+do { \
+ if ((c)->opts.verbose) \
+ bch_info(c, fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define pr_verbose_init(opts, fmt, ...) \
+do { \
+ if (opt_get(opts, verbose)) \
+ pr_info(fmt, ##__VA_ARGS__); \
+} while (0)
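The printk wrappers above are all used the same way; as a rough usage sketch, a caller might report progress and failure like this. The function and the "bringing device online" message are hypothetical and assume kernel context, but the macro signatures match the definitions above:

/* Hypothetical caller, assuming a struct bch_fs *c and an int ret from some helper: */
static void demo_report(struct bch_fs *c, int ret, unsigned dev_idx)
{
	bch_verbose(c, "bringing device %u online", dev_idx);	/* printed only if opts.verbose */

	if (ret) {
		/* Prints "demo_report(): error bringing device %u online <err string>" */
		bch_err_msg(c, ret, "bringing device %u online", dev_idx);
		return;
	}

	bch_info(c, "device %u online", dev_idx);
}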
+
+/* Parameters that are useful for debugging, but should always be compiled in: */
+#define BCH_DEBUG_PARAMS_ALWAYS() \
+ BCH_DEBUG_PARAM(key_merging_disabled, \
+ "Disables merging of extents") \
+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
+ "Causes mark and sweep to compact and rewrite every " \
+ "btree node it traverses") \
+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
+ "Disables rewriting of btree nodes during mark and sweep")\
+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \
+ "Disables the shrinker callback for the btree node cache")\
+ BCH_DEBUG_PARAM(verify_btree_ondisk, \
+ "Reread btree nodes at various points to verify the " \
+ "mergesort in the read path against modifications " \
+ "done in memory") \
+ BCH_DEBUG_PARAM(verify_all_btree_replicas, \
+ "When reading btree nodes, read all replicas and " \
+ "compare them") \
+ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
+ "Don't use the write buffer for backpointers, enabling "\
+ "extra runtime checks")
+
+/* Parameters that should only be compiled in debug mode: */
+#define BCH_DEBUG_PARAMS_DEBUG() \
+ BCH_DEBUG_PARAM(expensive_debug_checks, \
+ "Enables various runtime debugging checks that " \
+ "significantly affect performance") \
+ BCH_DEBUG_PARAM(debug_check_iterators, \
+ "Enables extra verification for btree iterators") \
+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \
+ "Verify btree accounting for keys within a node") \
+ BCH_DEBUG_PARAM(journal_seq_verify, \
+ "Store the journal sequence number in the version " \
+ "number of every btree key, and verify that btree " \
+ "update ordering is preserved during recovery") \
+ BCH_DEBUG_PARAM(inject_invalid_keys, \
+ "Store the journal sequence number in the version " \
+ "number of every btree key, and verify that btree " \
+ "update ordering is preserved during recovery") \
+ BCH_DEBUG_PARAM(test_alloc_startup, \
+ "Force allocator startup to use the slowpath where it" \
+ "can't find enough free buckets without invalidating" \
+ "cached data") \
+ BCH_DEBUG_PARAM(force_reconstruct_read, \
+ "Force reads to use the reconstruct path, when reading" \
+ "from erasure coded extents") \
+ BCH_DEBUG_PARAM(test_restart_gc, \
+ "Test restarting mark and sweep gc when bucket gens change")
+
+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
+#else
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
+#endif
+
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif
+
+#define BCH_TIME_STATS() \
+ x(btree_node_mem_alloc) \
+ x(btree_node_split) \
+ x(btree_node_compact) \
+ x(btree_node_merge) \
+ x(btree_node_sort) \
+ x(btree_node_read) \
+ x(btree_interior_update_foreground) \
+ x(btree_interior_update_total) \
+ x(btree_gc) \
+ x(data_write) \
+ x(data_read) \
+ x(data_promote) \
+ x(journal_flush_write) \
+ x(journal_noflush_write) \
+ x(journal_flush_seq) \
+ x(blocked_journal) \
+ x(blocked_allocate) \
+ x(blocked_allocate_open_bucket) \
+ x(nocow_lock_contended)
+
+enum bch_time_stats {
+#define x(name) BCH_TIME_##name,
+ BCH_TIME_STATS()
+#undef x
+ BCH_TIME_STAT_NR
+};
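BCH_TIME_STATS() above is a typical x-macro list: callers define x() to stamp out one construct per entry, expand the list, then undefine x() - the same pattern used later for BCH_WRITE_REFS(), BCH_BKEY_TYPES() and the inode field lists. A minimal self-contained sketch of the pattern outside the kernel (the DEMO_* names are invented for the example):

#include <stdio.h>

#define DEMO_STATS()	\
	x(node_split)	\
	x(node_merge)	\
	x(data_read)

/* Pass 1: generate enum constants, exactly like enum bch_time_stats above */
enum demo_stat {
#define x(name) DEMO_STAT_##name,
	DEMO_STATS()
#undef x
	DEMO_STAT_NR
};

/* Pass 2: reuse the same list to generate a matching name table */
static const char * const demo_stat_names[] = {
#define x(name) #name,
	DEMO_STATS()
#undef x
};

int main(void)
{
	for (int i = 0; i < DEMO_STAT_NR; i++)
		printf("%d: %s\n", i, demo_stat_names[i]);
	return 0;
}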
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "btree_write_buffer_types.h"
+#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
+#include "clock_types.h"
+#include "disk_groups_types.h"
+#include "ec_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "replicas_types.h"
+#include "subvolume_types.h"
+#include "super_types.h"
+
+/* Number of nodes btree coalesce will try to coalesce at once */
+#define GC_MERGE_NODES 4U
+
+/* Maximum number of nodes we might need to allocate atomically: */
+#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
+
+/* Size of the freelist we allocate btree nodes from: */
+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
+
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
+struct btree;
+
+enum gc_phase {
+ GC_PHASE_NOT_RUNNING,
+ GC_PHASE_START,
+ GC_PHASE_SB,
+
+ GC_PHASE_BTREE_stripes,
+ GC_PHASE_BTREE_extents,
+ GC_PHASE_BTREE_inodes,
+ GC_PHASE_BTREE_dirents,
+ GC_PHASE_BTREE_xattrs,
+ GC_PHASE_BTREE_alloc,
+ GC_PHASE_BTREE_quotas,
+ GC_PHASE_BTREE_reflink,
+ GC_PHASE_BTREE_subvolumes,
+ GC_PHASE_BTREE_snapshots,
+ GC_PHASE_BTREE_lru,
+ GC_PHASE_BTREE_freespace,
+ GC_PHASE_BTREE_need_discard,
+ GC_PHASE_BTREE_backpointers,
+ GC_PHASE_BTREE_bucket_gens,
+ GC_PHASE_BTREE_snapshot_trees,
+ GC_PHASE_BTREE_deleted_inodes,
+ GC_PHASE_BTREE_logged_ops,
+ GC_PHASE_BTREE_rebalance_work,
+
+ GC_PHASE_PENDING_DELETE,
+};
+
+struct gc_pos {
+ enum gc_phase phase;
+ struct bpos pos;
+ unsigned level;
+};
+
+struct reflink_gc {
+ u64 offset;
+ u32 size;
+ u32 refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
+struct io_count {
+ u64 sectors[2][BCH_DATA_NR];
+};
+
+struct bch_dev {
+ struct kobject kobj;
+ struct percpu_ref ref;
+ struct completion ref_completion;
+ struct percpu_ref io_ref;
+ struct completion io_ref_completion;
+
+ struct bch_fs *fs;
+
+ u8 dev_idx;
+ /*
+ * Cached version of this device's member info from superblock
+ * Committed by bch2_write_super() -> bch_fs_mi_update()
+ */
+ struct bch_member_cpu mi;
+ atomic64_t errors[BCH_MEMBER_ERROR_NR];
+
+ __uuid_t uuid;
+ char name[BDEVNAME_SIZE];
+
+ struct bch_sb_handle disk_sb;
+ struct bch_sb *sb_read_scratch;
+ int sb_write_error;
+ dev_t dev;
+ atomic_t flush_seq;
+
+ struct bch_devs_mask self;
+
+ /* biosets used in cloned bios for writing multiple replicas */
+ struct bio_set replica_set;
+
+ /*
+ * Buckets:
+ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
+ * gc_lock, for device resize - holding any is sufficient for access:
+ * Or rcu_read_lock(), but only for ptr_stale():
+ */
+ struct bucket_array __rcu *buckets_gc;
+ struct bucket_gens __rcu *bucket_gens;
+ u8 *oldest_gen;
+ unsigned long *buckets_nouse;
+ struct rw_semaphore bucket_lock;
+
+ struct bch_dev_usage *usage_base;
+ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
+ struct bch_dev_usage __percpu *usage_gc;
+
+ /* Allocator: */
+ u64 new_fs_bucket_idx;
+ u64 alloc_cursor;
+
+ unsigned nr_open_buckets;
+ unsigned nr_btree_reserve;
+
+ size_t inc_gen_needs_gc;
+ size_t inc_gen_really_needs_gc;
+ size_t buckets_waiting_on_journal;
+
+ atomic64_t rebalance_work;
+
+ struct journal_device journal;
+ u64 prev_journal_sector;
+
+ struct work_struct io_error_work;
+
+ /* The rest of this all shows up in sysfs */
+ atomic64_t cur_latency[2];
+ struct bch2_time_stats io_latency[2];
+
+#define CONGESTED_MAX 1024
+ atomic_t congested;
+ u64 congested_last;
+
+ struct io_count __percpu *io_done;
+};
+
+enum {
+ /* startup: */
+ BCH_FS_STARTED,
+ BCH_FS_MAY_GO_RW,
+ BCH_FS_RW,
+ BCH_FS_WAS_RW,
+
+ /* shutdown: */
+ BCH_FS_STOPPING,
+ BCH_FS_EMERGENCY_RO,
+ BCH_FS_GOING_RO,
+ BCH_FS_WRITE_DISABLE_COMPLETE,
+ BCH_FS_CLEAN_SHUTDOWN,
+
+ /* fsck passes: */
+ BCH_FS_FSCK_DONE,
+ BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
+ BCH_FS_NEED_ANOTHER_GC,
+
+ BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
+
+ /* errors: */
+ BCH_FS_ERROR,
+ BCH_FS_TOPOLOGY_ERROR,
+ BCH_FS_ERRORS_FIXED,
+ BCH_FS_ERRORS_NOT_FIXED,
+};
+
+struct btree_debug {
+ unsigned id;
+};
+
+#define BCH_TRANSACTIONS_NR 128
+
+struct btree_transaction_stats {
+ struct bch2_time_stats lock_hold_times;
+ struct mutex lock;
+ unsigned nr_max_paths;
+ unsigned wb_updates_size;
+ unsigned max_mem;
+ char *max_paths_text;
+};
+
+struct bch_fs_pcpu {
+ u64 sectors_available;
+};
+
+struct journal_seq_blacklist_table {
+ size_t nr;
+ struct journal_seq_blacklist_table_entry {
+ u64 start;
+ u64 end;
+ bool dirty;
+ } entries[];
+};
+
+struct journal_keys {
+ struct journal_key {
+ u64 journal_seq;
+ u32 journal_offset;
+ enum btree_id btree_id:8;
+ unsigned level:8;
+ bool allocated;
+ bool overwritten;
+ struct bkey_i *k;
+ } *d;
+ /*
+ * Gap buffer: instead of all the empty space in the array being at the
+ * end of the buffer - from @nr to @size - the empty space is at @gap.
+ * This means that sequential insertions are O(n) instead of O(n^2).
+ */
+ size_t gap;
+ size_t nr;
+ size_t size;
+ atomic_t ref;
+ bool initial_ref_held;
+};
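The gap-buffer comment above is the whole trick: keeping the empty space at the last insertion point makes a run of nearby insertions cheap, since only the gap has to move. A small self-contained sketch of gap-buffer insertion over plain integers (a demo of the data structure only, not the journal_key machinery):

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define CAP 8

struct gapbuf {
	int	d[CAP];
	size_t	gap;	/* logical index where the empty space sits */
	size_t	nr;	/* number of live elements */
};

/* Logical index -> physical slot: everything at or past the gap lives at the end */
static size_t gapbuf_idx(const struct gapbuf *b, size_t i)
{
	return i < b->gap ? i : i + (CAP - b->nr);
}

/* Move the empty region so it starts at logical index idx */
static void gapbuf_move_gap(struct gapbuf *b, size_t idx)
{
	size_t gap_size = CAP - b->nr;

	if (idx < b->gap)		/* slide elements [idx, gap) to the right */
		memmove(&b->d[idx + gap_size], &b->d[idx],
			(b->gap - idx) * sizeof(int));
	else if (idx > b->gap)		/* slide elements [gap, idx) to the left */
		memmove(&b->d[b->gap], &b->d[b->gap + gap_size],
			(idx - b->gap) * sizeof(int));
	b->gap = idx;
}

/* Insert v at logical index idx; sequential inserts barely move any data */
static void gapbuf_insert(struct gapbuf *b, size_t idx, int v)
{
	assert(b->nr < CAP && idx <= b->nr);
	gapbuf_move_gap(b, idx);
	b->d[b->gap++] = v;
	b->nr++;
}

int main(void)
{
	struct gapbuf b = { .gap = 0, .nr = 0 };

	for (int i = 0; i < 5; i++)
		gapbuf_insert(&b, i, i * 10);	/* sequential: no memmove at all */
	gapbuf_insert(&b, 2, 99);		/* one small memmove to relocate the gap */

	for (size_t i = 0; i < b.nr; i++)
		printf("%d ", b.d[gapbuf_idx(&b, i)]);
	printf("\n");
	return 0;
}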
+
+struct btree_trans_buf {
+ struct btree_trans *trans;
+};
+
+#define REPLICAS_DELTA_LIST_MAX (1U << 16)
+
+#define BCACHEFS_ROOT_SUBVOL_INUM \
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+
+#define BCH_WRITE_REFS() \
+ x(trans) \
+ x(write) \
+ x(promote) \
+ x(node_rewrite) \
+ x(stripe_create) \
+ x(stripe_delete) \
+ x(reflink) \
+ x(fallocate) \
+ x(discard) \
+ x(invalidate) \
+ x(delete_dead_snapshots) \
+ x(snapshot_delete_pagecache) \
+ x(sysfs)
+
+enum bch_write_ref {
+#define x(n) BCH_WRITE_REF_##n,
+ BCH_WRITE_REFS()
+#undef x
+ BCH_WRITE_REF_NR,
+};
+
+struct bch_fs {
+ struct closure cl;
+
+ struct list_head list;
+ struct kobject kobj;
+ struct kobject counters_kobj;
+ struct kobject internal;
+ struct kobject opts_dir;
+ struct kobject time_stats;
+ unsigned long flags;
+
+ int minor;
+ struct device *chardev;
+ struct super_block *vfs_sb;
+ dev_t dev;
+ char name[40];
+
+ /* ro/rw, add/remove/resize devices: */
+ struct rw_semaphore state_lock;
+
+ /* Counts outstanding writes, for clean transition to read-only */
+#ifdef BCH_WRITE_REF_DEBUG
+ atomic_long_t writes[BCH_WRITE_REF_NR];
+#else
+ struct percpu_ref writes;
+#endif
+ struct work_struct read_only_work;
+
+ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
+
+ struct bch_replicas_cpu replicas;
+ struct bch_replicas_cpu replicas_gc;
+ struct mutex replicas_gc_lock;
+ mempool_t replicas_delta_pool;
+
+ struct journal_entry_res btree_root_journal_res;
+ struct journal_entry_res replicas_journal_res;
+ struct journal_entry_res clock_journal_res;
+ struct journal_entry_res dev_usage_journal_res;
+
+ struct bch_disk_groups_cpu __rcu *disk_groups;
+
+ struct bch_opts opts;
+
+ /* Updated by bch2_sb_update():*/
+ struct {
+ __uuid_t uuid;
+ __uuid_t user_uuid;
+
+ u16 version;
+ u16 version_min;
+ u16 version_upgrade_complete;
+
+ u8 nr_devices;
+ u8 clean;
+
+ u8 encryption_type;
+
+ u64 time_base_lo;
+ u32 time_base_hi;
+ unsigned time_units_per_sec;
+ unsigned nsec_per_time_unit;
+ u64 features;
+ u64 compat;
+ unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ } sb;
+
+
+ struct bch_sb_handle disk_sb;
+
+ unsigned short block_bits; /* ilog2(block_size) */
+
+ u16 btree_foreground_merge_threshold;
+
+ struct closure sb_write;
+ struct mutex sb_lock;
+
+ /* snapshot.c: */
+ struct snapshot_table __rcu *snapshots;
+ size_t snapshot_table_size;
+ struct mutex snapshot_table_lock;
+ struct rw_semaphore snapshot_create_lock;
+
+ struct work_struct snapshot_delete_work;
+ struct work_struct snapshot_wait_for_pagecache_and_delete_work;
+ snapshot_id_list snapshots_unlinked;
+ struct mutex snapshots_unlinked_lock;
+
+ /* BTREE CACHE */
+ struct bio_set btree_bio;
+ struct workqueue_struct *io_complete_wq;
+
+ struct btree_root btree_roots_known[BTREE_ID_NR];
+ DARRAY(struct btree_root) btree_roots_extra;
+ struct mutex btree_root_lock;
+
+ struct btree_cache btree_cache;
+
+ /*
+ * Cache of allocated btree nodes - if we allocate a btree node and
+ * don't use it, freeing it means that space can't be reused until going
+ * _all_ the way through the allocator (which exposes us to a livelock
+ * when allocating btree reserves fails halfway through) - instead, we
+ * can stick them here:
+ */
+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+ unsigned btree_reserve_cache_nr;
+ struct mutex btree_reserve_cache_lock;
+
+ mempool_t btree_interior_update_pool;
+ struct list_head btree_interior_update_list;
+ struct list_head btree_interior_updates_unwritten;
+ struct mutex btree_interior_update_lock;
+ struct closure_waitlist btree_interior_update_wait;
+
+ struct workqueue_struct *btree_interior_update_worker;
+ struct work_struct btree_interior_update_work;
+
+ struct list_head pending_node_rewrites;
+ struct mutex pending_node_rewrites_lock;
+
+ /* btree_io.c: */
+ spinlock_t btree_write_error_lock;
+ struct btree_write_stats {
+ atomic64_t nr;
+ atomic64_t bytes;
+ } btree_write_stats[BTREE_WRITE_TYPE_NR];
+
+ /* btree_iter.c: */
+ struct seqmutex btree_trans_lock;
+ struct list_head btree_trans_list;
+ mempool_t btree_trans_pool;
+ mempool_t btree_trans_mem_pool;
+ struct btree_trans_buf __percpu *btree_trans_bufs;
+
+ struct srcu_struct btree_trans_barrier;
+ bool btree_trans_barrier_initialized;
+
+ struct btree_key_cache btree_key_cache;
+ unsigned btree_key_cache_btrees;
+
+ struct btree_write_buffer btree_write_buffer;
+
+ struct workqueue_struct *btree_update_wq;
+ struct workqueue_struct *btree_io_complete_wq;
+ /* copygc needs its own workqueue for index updates */
+ struct workqueue_struct *copygc_wq;
+ /*
+ * Use a dedicated wq for write ref holder tasks. Required to avoid
+ * dependency problems with other wq tasks that can block on ref
+ * draining, such as read-only transition.
+ */
+ struct workqueue_struct *write_ref_wq;
+
+ /* ALLOCATION */
+ struct bch_devs_mask rw_devs[BCH_DATA_NR];
+
+ u64 capacity; /* sectors */
+
+ /*
+ * When capacity _decreases_ (due to a disk being removed), we
+ * increment capacity_gen - this invalidates outstanding reservations
+ * and forces them to be revalidated
+ */
+ u32 capacity_gen;
+ unsigned bucket_size_max;
+
+ atomic64_t sectors_available;
+ struct mutex sectors_available_lock;
+
+ struct bch_fs_pcpu __percpu *pcpu;
+
+ struct percpu_rw_semaphore mark_lock;
+
+ seqcount_t usage_lock;
+ struct bch_fs_usage *usage_base;
+ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
+ struct bch_fs_usage __percpu *usage_gc;
+ u64 __percpu *online_reserved;
+
+ /* single element mempool: */
+ struct mutex usage_scratch_lock;
+ struct bch_fs_usage_online *usage_scratch;
+
+ struct io_clock io_clock[2];
+
+ /* JOURNAL SEQ BLACKLIST */
+ struct journal_seq_blacklist_table *
+ journal_seq_blacklist_table;
+ struct work_struct journal_seq_blacklist_gc_work;
+
+ /* ALLOCATOR */
+ spinlock_t freelist_lock;
+ struct closure_waitlist freelist_wait;
+ u64 blocked_allocate;
+ u64 blocked_allocate_open_bucket;
+
+ open_bucket_idx_t open_buckets_freelist;
+ open_bucket_idx_t open_buckets_nr_free;
+ struct closure_waitlist open_buckets_wait;
+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
+
+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_partial_nr;
+
+ struct write_point btree_write_point;
+ struct write_point rebalance_write_point;
+
+ struct write_point write_points[WRITE_POINT_MAX];
+ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR];
+ struct mutex write_points_hash_lock;
+ unsigned write_points_nr;
+
+ struct buckets_waiting_for_journal buckets_waiting_for_journal;
+ struct work_struct discard_work;
+ struct work_struct invalidate_work;
+
+ /* GARBAGE COLLECTION */
+ struct task_struct *gc_thread;
+ atomic_t kick_gc;
+ unsigned long gc_count;
+
+ enum btree_id gc_gens_btree;
+ struct bpos gc_gens_pos;
+
+ /*
+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
+ * has been marked by GC.
+ *
+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
+ *
+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
+ * can read without a lock.
+ */
+ seqcount_t gc_pos_lock;
+ struct gc_pos gc_pos;
+
+ /*
+ * The allocation code needs gc_mark in struct bucket to be correct, but
+ * it's not while a gc is in progress.
+ */
+ struct rw_semaphore gc_lock;
+ struct mutex gc_gens_lock;
+
+ /* IO PATH */
+ struct semaphore io_in_flight;
+ struct bio_set bio_read;
+ struct bio_set bio_read_split;
+ struct bio_set bio_write;
+ struct mutex bio_bounce_pages_lock;
+ mempool_t bio_bounce_pages;
+ struct bucket_nocow_lock_table
+ nocow_locks;
+ struct rhashtable promote_table;
+
+ mempool_t compression_bounce[2];
+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
+ mempool_t decompress_workspace;
+ size_t zstd_workspace_size;
+
+ struct crypto_shash *sha256;
+ struct crypto_sync_skcipher *chacha20;
+ struct crypto_shash *poly1305;
+
+ atomic64_t key_version;
+
+ mempool_t large_bkey_pool;
+
+ /* MOVE.C */
+ struct list_head moving_context_list;
+ struct mutex moving_context_lock;
+
+ /* REBALANCE */
+ struct bch_fs_rebalance rebalance;
+
+ /* COPYGC */
+ struct task_struct *copygc_thread;
+ struct write_point copygc_write_point;
+ s64 copygc_wait_at;
+ s64 copygc_wait;
+ bool copygc_running;
+ wait_queue_head_t copygc_running_wq;
+
+ /* STRIPES: */
+ GENRADIX(struct stripe) stripes;
+ GENRADIX(struct gc_stripe) gc_stripes;
+
+ struct hlist_head ec_stripes_new[32];
+ spinlock_t ec_stripes_new_lock;
+
+ ec_stripes_heap ec_stripes_heap;
+ struct mutex ec_stripes_heap_lock;
+
+ /* ERASURE CODING */
+ struct list_head ec_stripe_head_list;
+ struct mutex ec_stripe_head_lock;
+
+ struct list_head ec_stripe_new_list;
+ struct mutex ec_stripe_new_lock;
+ wait_queue_head_t ec_stripe_new_wait;
+
+ struct work_struct ec_stripe_create_work;
+ u64 ec_stripe_hint;
+
+ struct work_struct ec_stripe_delete_work;
+
+ struct bio_set ec_bioset;
+
+ /* REFLINK */
+ reflink_gc_table reflink_gc_table;
+ size_t reflink_gc_nr;
+
+ /* fs.c */
+ struct list_head vfs_inodes_list;
+ struct mutex vfs_inodes_lock;
+
+ /* VFS IO PATH - fs-io.c */
+ struct bio_set writepage_bioset;
+ struct bio_set dio_write_bioset;
+ struct bio_set dio_read_bioset;
+ struct bio_set nocow_flush_bioset;
+
+ /* QUOTAS */
+ struct bch_memquota_type quotas[QTYP_NR];
+
+ /* RECOVERY */
+ u64 journal_replay_seq_start;
+ u64 journal_replay_seq_end;
+ enum bch_recovery_pass curr_recovery_pass;
+ /* bitmap of explicitly enabled recovery passes: */
+ u64 recovery_passes_explicit;
+ u64 recovery_passes_complete;
+
+ /* DEBUG JUNK */
+ struct dentry *fs_debug_dir;
+ struct dentry *btree_debug_dir;
+ struct btree_debug btree_debug[BTREE_ID_NR];
+ struct btree *verify_data;
+ struct btree_node *verify_ondisk;
+ struct mutex verify_lock;
+
+ u64 *unused_inode_hints;
+ unsigned inode_shard_bits;
+
+ /*
+ * A btree node on disk could have too many bsets for an iterator to fit
+ * on the stack - have to dynamically allocate them
+ */
+ mempool_t fill_iter;
+
+ mempool_t btree_bounce_pool;
+
+ struct journal journal;
+ GENRADIX(struct journal_replay *) journal_entries;
+ u64 journal_entries_base_seq;
+ struct journal_keys journal_keys;
+ struct list_head journal_iters;
+
+ u64 last_bucket_seq_cleanup;
+
+ u64 counters_on_mount[BCH_COUNTER_NR];
+ u64 __percpu *counters;
+
+ unsigned btree_gc_periodic:1;
+ unsigned copy_gc_enabled:1;
+ bool promote_whole_extents;
+
+ struct bch2_time_stats times[BCH_TIME_STAT_NR];
+
+ struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
+
+ /* ERRORS */
+ struct list_head fsck_error_msgs;
+ struct mutex fsck_error_msgs_lock;
+ bool fsck_alloc_msgs_err;
+
+ bch_sb_errors_cpu fsck_error_counts;
+ struct mutex fsck_error_counts_lock;
+};
+
+extern struct wait_queue_head bch2_read_only_wait;
+
+static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ atomic_long_inc(&c->writes[ref]);
+#else
+ percpu_ref_get(&c->writes);
+#endif
+}
+
+static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+ atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+ return percpu_ref_tryget_live(&c->writes);
+#endif
+}
+
+static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ long v = atomic_long_dec_return(&c->writes[ref]);
+
+ BUG_ON(v < 0);
+ if (v)
+ return;
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+ if (atomic_long_read(&c->writes[i]))
+ return;
+
+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ wake_up(&bch2_read_only_wait);
+#else
+ percpu_ref_put(&c->writes);
+#endif
+}
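The write refs above follow the usual tryget/put discipline: a task that needs the filesystem to stay writable takes a named ref, and the read-only transition can only complete once every ref is dropped. A rough usage sketch; the function body is hypothetical and assumes kernel context, but BCH_WRITE_REF_discard and the helpers are the ones defined above:

/* Hypothetical caller, assuming a struct bch_fs *c in kernel context: */
static int demo_do_discards(struct bch_fs *c)
{
	/* Fails once the filesystem has started going read-only */
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
		return -EROFS;

	/* ... issue discards while the ref pins the fs writable ... */

	/* Dropping the last ref lets the read-only transition complete */
	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
	return 0;
}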
+
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+ if (c->vfs_sb)
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
+static inline unsigned bucket_bytes(const struct bch_dev *ca)
+{
+ return ca->mi.bucket_size << 9;
+}
+
+static inline unsigned block_bytes(const struct bch_fs *c)
+{
+ return c->opts.block_size;
+}
+
+static inline unsigned block_sectors(const struct bch_fs *c)
+{
+ return c->opts.block_size >> 9;
+}
+
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+ return c->opts.btree_node_size >> 9;
+}
+
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+ return c->btree_key_cache_btrees & (1U << btree);
+}
+
+static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
+{
+ struct timespec64 t;
+ s32 rem;
+
+ time += c->sb.time_base_lo;
+
+ t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+ t.tv_nsec = rem * c->sb.nsec_per_time_unit;
+ return t;
+}
+
+static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
+{
+ return (ts.tv_sec * c->sb.time_units_per_sec +
+ (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
+}
+
+static inline s64 bch2_current_time(const struct bch_fs *c)
+{
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
+ return timespec_to_bch2_time(c, now);
+}
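The two conversions above are inverses given the superblock time base. As a rough worked example in plain user-space C, with hypothetical parameters (nanosecond units, zero time base - real filesystems carry these in c->sb):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical superblock time parameters for the demo */
#define DEMO_TIME_BASE_LO	0LL
#define DEMO_UNITS_PER_SEC	1000000000LL
#define DEMO_NSEC_PER_UNIT	1LL

int main(void)
{
	int64_t bch2_time = 1700000000123456789LL;

	/* Mirrors bch2_time_to_timespec() */
	int64_t t    = bch2_time + DEMO_TIME_BASE_LO;
	int64_t sec  = t / DEMO_UNITS_PER_SEC;
	int64_t nsec = (t % DEMO_UNITS_PER_SEC) * DEMO_NSEC_PER_UNIT;

	/* Mirrors timespec_to_bch2_time() */
	int64_t back = sec * DEMO_UNITS_PER_SEC + nsec / DEMO_NSEC_PER_UNIT
		     - DEMO_TIME_BASE_LO;

	printf("sec=%lld nsec=%lld round_trip_ok=%d\n",
	       (long long) sec, (long long) nsec, back == bch2_time);
	return 0;
}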
+
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+{
+ return dev < c->sb.nr_devices && c->devs[dev];
+}
+
+#define BKEY_PADDED_ONSTACK(key, pad) \
+ struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+
+#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
new file mode 100644
index 000000000000..fe78e87603fc
--- /dev/null
+++ b/fs/bcachefs/bcachefs_format.h
@@ -0,0 +1,2454 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FORMAT_H
+#define _BCACHEFS_FORMAT_H
+
+/*
+ * bcachefs on disk data structures
+ *
+ * OVERVIEW:
+ *
+ * There are three main types of on disk data structures in bcachefs (this is
+ * reduced from 5 in bcache)
+ *
+ * - superblock
+ * - journal
+ * - btree
+ *
+ * The btree is the primary structure; most metadata exists as keys in the
+ * various btrees. There are only a small number of btrees, they're not
+ * sharded - we have one btree for extents, another for inodes, et cetera.
+ *
+ * SUPERBLOCK:
+ *
+ * The superblock contains the location of the journal, the list of devices in
+ * the filesystem, and in general any metadata we need in order to decide
+ * whether we can start a filesystem or prior to reading the journal/btree
+ * roots.
+ *
+ * The superblock is extensible, and most of the contents of the superblock are
+ * in variable length, type tagged fields; see struct bch_sb_field.
+ *
+ * Backup superblocks do not reside in a fixed location; also, superblocks do
+ * not have a fixed size. To locate backup superblocks we have struct
+ * bch_sb_layout; we store a copy of this inside every superblock, and also
+ * before the first superblock.
+ *
+ * JOURNAL:
+ *
+ * The journal primarily records btree updates in the order they occurred;
+ * journal replay consists of just iterating over all the keys in the open
+ * journal entries and re-inserting them into the btrees.
+ *
+ * The journal also contains entry types for the btree roots, and blacklisted
+ * journal sequence numbers (see journal_seq_blacklist.c).
+ *
+ * BTREE:
+ *
+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
+ * 128k-256k) and log structured. We use struct btree_node for writing the first
+ * entry in a given node (offset 0), and struct btree_node_entry for all
+ * subsequent writes.
+ *
+ * After the header, btree node entries contain a list of keys in sorted order.
+ * Values are stored inline with the keys; since values are variable length (and
+ * keys effectively are variable length too, due to packing) we can't do random
+ * access without building up additional in memory tables in the btree node read
+ * path.
+ *
+ * BTREE KEYS (struct bkey):
+ *
+ * The various btrees share a common format for the key - so as to avoid
+ * switching in fastpath lookup/comparison code - but define their own
+ * structures for the key values.
+ *
+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max
+ * size is just under 2k. The common part also contains a type tag for the
+ * value, and a format field indicating whether the key is packed or not (and
+ * also meant to allow adding new key fields in the future, if desired).
+ *
+ * bkeys, when stored within a btree node, may also be packed. In that case, the
+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
+ * be generous with field sizes in the common part of the key format (64 bit
+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
+ */
+
+#include <asm/types.h>
+#include <asm/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/uuid.h>
+#include "vstructs.h"
+
+#ifdef __KERNEL__
+typedef uuid_t __uuid_t;
+#endif
+
+#define BITMASK(name, type, field, offset, end) \
+static const __maybe_unused unsigned name##_OFFSET = offset; \
+static const __maybe_unused unsigned name##_BITS = (end - offset); \
+ \
+static inline __u64 name(const type *k) \
+{ \
+ return (k->field >> offset) & ~(~0ULL << (end - offset)); \
+} \
+ \
+static inline void SET_##name(type *k, __u64 v) \
+{ \
+ k->field &= ~(~(~0ULL << (end - offset)) << offset); \
+ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
+}
+
+#define LE_BITMASK(_bits, name, type, field, offset, end) \
+static const __maybe_unused unsigned name##_OFFSET = offset; \
+static const __maybe_unused unsigned name##_BITS = (end - offset); \
+static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
+ \
+static inline __u64 name(const type *k) \
+{ \
+ return (__le##_bits##_to_cpu(k->field) >> offset) & \
+ ~(~0ULL << (end - offset)); \
+} \
+ \
+static inline void SET_##name(type *k, __u64 v) \
+{ \
+ __u##_bits new = __le##_bits##_to_cpu(k->field); \
+ \
+ new &= ~(~(~0ULL << (end - offset)) << offset); \
+ new |= (v & ~(~0ULL << (end - offset))) << offset; \
+ k->field = __cpu_to_le##_bits(new); \
+}
+
+#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e)
+#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e)
+#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e)
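The BITMASK()/LE_BITMASK() generators above turn one line into a getter and a setter over a range of bits in a field. For instance, BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) further down in this header expands to roughly the following (hand-expanded here for illustration; __maybe_unused and __u64 come from kernel headers):

static const __maybe_unused unsigned BCH_ALLOC_V4_NEED_DISCARD_OFFSET = 0;
static const __maybe_unused unsigned BCH_ALLOC_V4_NEED_DISCARD_BITS = (1 - 0);

static inline __u64 BCH_ALLOC_V4_NEED_DISCARD(const struct bch_alloc_v4 *k)
{
	return (k->flags >> 0) & ~(~0ULL << (1 - 0));	/* i.e. flags & 1 */
}

static inline void SET_BCH_ALLOC_V4_NEED_DISCARD(struct bch_alloc_v4 *k, __u64 v)
{
	k->flags &= ~(~(~0ULL << (1 - 0)) << 0);	/* clear bit 0 */
	k->flags |= (v & ~(~0ULL << (1 - 0))) << 0;	/* then set it from v */
}

The LE_BITMASK() variant does the same but byte-swaps through __le{16,32,64}_to_cpu()/__cpu_to_le{16,32,64}() so the on-disk field stays little endian.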
+
+struct bkey_format {
+ __u8 key_u64s;
+ __u8 nr_fields;
+ /* One unused slot for now: */
+ __u8 bits_per_field[6];
+ __le64 field_offset[6];
+};
+
+/* Btree keys - all units are in sectors */
+
+struct bpos {
+ /*
+ * Word order matches machine byte order - btree code treats a bpos as a
+ * single large integer, for search/comparison purposes
+ *
+ * Note that wherever a bpos is embedded in another on disk data
+ * structure, it has to be byte swabbed when reading in metadata that
+ * wasn't written in native endian order:
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ __u32 snapshot;
+ __u64 offset;
+ __u64 inode;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ __u64 inode;
+ __u64 offset; /* Points to end of extent - sectors */
+ __u32 snapshot;
+#else
+#error edit for your odd byteorder.
+#endif
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
+
+#define KEY_INODE_MAX ((__u64)~0ULL)
+#define KEY_OFFSET_MAX ((__u64)~0ULL)
+#define KEY_SNAPSHOT_MAX ((__u32)~0U)
+#define KEY_SIZE_MAX ((__u32)~0U)
+
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
+{
+ return (struct bpos) {
+ .inode = inode,
+ .offset = offset,
+ .snapshot = snapshot,
+ };
+}
+
+#define POS_MIN SPOS(0, 0, 0)
+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
+#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
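As the comment on struct bpos says, the btree code compares positions as one big integer: inode first, then offset, then snapshot. A stand-alone sketch of that ordering (demo_bpos_cmp is illustrative; the kernel's comparison helpers live elsewhere in the series):

#include <stdint.h>
#include <stdio.h>

struct demo_bpos { uint64_t inode, offset; uint32_t snapshot; };

/* Compare inode, then offset, then snapshot - i.e. one big integer */
static int demo_bpos_cmp(struct demo_bpos l, struct demo_bpos r)
{
	if (l.inode != r.inode)		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)	return l.offset < r.offset ? -1 : 1;
	if (l.snapshot != r.snapshot)	return l.snapshot < r.snapshot ? -1 : 1;
	return 0;
}

int main(void)
{
	struct demo_bpos a = { .inode = 1, .offset = 4096, .snapshot = 0 };
	struct demo_bpos b = { .inode = 2, .offset = 0,    .snapshot = 0 };

	/* a < b: the inode field dominates the offset, just like bpos above */
	printf("%d\n", demo_bpos_cmp(a, b));	/* -1 */
	return 0;
}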
+
+/* Empty placeholder struct, for container_of() */
+struct bch_val {
+ __u64 __nothing[0];
+};
+
+struct bversion {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ __u64 lo;
+ __u32 hi;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ __u32 hi;
+ __u64 lo;
+#endif
+} __packed __aligned(4);
+
+struct bkey {
+ /* Size of combined key and value, in u64s */
+ __u8 u64s;
+
+ /* Format of key (0 for format local to btree node) */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 format:7,
+ needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 needs_whiteout:1,
+ format:7;
+#else
+#error edit for your odd byteorder.
+#endif
+
+ /* Type of the value */
+ __u8 type;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ __u8 pad[1];
+
+ struct bversion version;
+ __u32 size; /* extent size, in sectors */
+ struct bpos p;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ struct bpos p;
+ __u32 size; /* extent size, in sectors */
+ struct bversion version;
+
+ __u8 pad[1];
+#endif
+} __packed __aligned(8);
+
+struct bkey_packed {
+ __u64 _data[0];
+
+ /* Size of combined key and value, in u64s */
+ __u8 u64s;
+
+ /* Format of key (0 for format local to btree node) */
+
+ /*
+ * XXX: next incompat on disk format change, switch format and
+ * needs_whiteout - bkey_packed() will be cheaper if format is the high
+ * bits of the bitfield
+ */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 format:7,
+ needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 needs_whiteout:1,
+ format:7;
+#endif
+
+ /* Type of the value */
+ __u8 type;
+ __u8 key_start[0];
+
+ /*
+ * We copy bkeys with struct assignment in various places, and while
+ * that shouldn't be done with packed bkeys we can't disallow it in C,
+ * and it's legal to cast a bkey to a bkey_packed - so padding it out
+ * to the same size as struct bkey should hopefully be safest.
+ */
+ __u8 pad[sizeof(struct bkey) - 3];
+} __packed __aligned(8);
+
+typedef struct {
+ __le64 lo;
+ __le64 hi;
+} bch_le128;
+
+#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
+#define BKEY_U64s_MAX U8_MAX
+#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
+
+#define KEY_PACKED_BITS_START 24
+
+#define KEY_FORMAT_LOCAL_BTREE 0
+#define KEY_FORMAT_CURRENT 1
+
+enum bch_bkey_fields {
+ BKEY_FIELD_INODE,
+ BKEY_FIELD_OFFSET,
+ BKEY_FIELD_SNAPSHOT,
+ BKEY_FIELD_SIZE,
+ BKEY_FIELD_VERSION_HI,
+ BKEY_FIELD_VERSION_LO,
+ BKEY_NR_FIELDS,
+};
+
+#define bkey_format_field(name, field) \
+ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
+
+#define BKEY_FORMAT_CURRENT \
+((struct bkey_format) { \
+ .key_u64s = BKEY_U64s, \
+ .nr_fields = BKEY_NR_FIELDS, \
+ .bits_per_field = { \
+ bkey_format_field(INODE, p.inode), \
+ bkey_format_field(OFFSET, p.offset), \
+ bkey_format_field(SNAPSHOT, p.snapshot), \
+ bkey_format_field(SIZE, size), \
+ bkey_format_field(VERSION_HI, version.hi), \
+ bkey_format_field(VERSION_LO, version.lo), \
+ }, \
+})
+
+/* bkey with inline value */
+struct bkey_i {
+ __u64 _data[0];
+
+ struct bkey k;
+ struct bch_val v;
+};
+
+#define KEY(_inode, _offset, _size) \
+((struct bkey) { \
+ .u64s = BKEY_U64s, \
+ .format = KEY_FORMAT_CURRENT, \
+ .p = POS(_inode, _offset), \
+ .size = _size, \
+})
+
+static inline void bkey_init(struct bkey *k)
+{
+ *k = KEY(0, 0, 0);
+}
+
+#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
+
+#define __BKEY_PADDED(key, pad) \
+ struct bkey_i key; __u64 key ## _pad[pad]
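A key with an inline value is typically built on the stack with some padding after it for the value. A rough usage sketch combining BKEY_PADDED_ONSTACK() (from bcachefs.h above), bkey_init() and POS(); the function, the pad size of 8 and the inode number are made up for the example, and kernel context is assumed:

/* Hypothetical snippet, assuming kernel context: */
static void demo_build_key(void)
{
	/* Declares struct { struct bkey_i k; __u64 k_pad[8]; } - room for a small value */
	BKEY_PADDED_ONSTACK(k, 8) tmp;

	bkey_init(&tmp.k.k);		/* *k = KEY(0, 0, 0): u64s = BKEY_U64s, format = KEY_FORMAT_CURRENT */
	tmp.k.k.p = POS(4096, 0);	/* inode 4096, offset 0, snapshot 0 */

	/* A real caller would also set k.type and grow k.u64s to cover its value */
}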
+
+/*
+ * - DELETED keys are used internally to mark keys that should be ignored but
+ * override keys in composition order. Their version number is ignored.
+ *
+ * - DISCARDED keys indicate that the data is all 0s because it has been
+ * discarded. DISCARDs may have a version; if the version is nonzero the key
+ * will be persistent, otherwise the key will be dropped whenever the btree
+ * node is rewritten (like DELETED keys).
+ *
+ * - ERROR: any read of the data returns a read error, as the data was lost due
+ * to a failing device. Like DISCARDED keys, they can be removed (overridden)
+ * by new writes or cluster-wide GC. Node repair can also overwrite them with
+ * the same or a more recent version number, but not with an older version
+ * number.
+ *
+ * - WHITEOUT: for hash table btrees
+ */
+#define BCH_BKEY_TYPES() \
+ x(deleted, 0) \
+ x(whiteout, 1) \
+ x(error, 2) \
+ x(cookie, 3) \
+ x(hash_whiteout, 4) \
+ x(btree_ptr, 5) \
+ x(extent, 6) \
+ x(reservation, 7) \
+ x(inode, 8) \
+ x(inode_generation, 9) \
+ x(dirent, 10) \
+ x(xattr, 11) \
+ x(alloc, 12) \
+ x(quota, 13) \
+ x(stripe, 14) \
+ x(reflink_p, 15) \
+ x(reflink_v, 16) \
+ x(inline_data, 17) \
+ x(btree_ptr_v2, 18) \
+ x(indirect_inline_data, 19) \
+ x(alloc_v2, 20) \
+ x(subvolume, 21) \
+ x(snapshot, 22) \
+ x(inode_v2, 23) \
+ x(alloc_v3, 24) \
+ x(set, 25) \
+ x(lru, 26) \
+ x(alloc_v4, 27) \
+ x(backpointer, 28) \
+ x(inode_v3, 29) \
+ x(bucket_gens, 30) \
+ x(snapshot_tree, 31) \
+ x(logged_op_truncate, 32) \
+ x(logged_op_finsert, 33)
+
+enum bch_bkey_type {
+#define x(name, nr) KEY_TYPE_##name = nr,
+ BCH_BKEY_TYPES()
+#undef x
+ KEY_TYPE_MAX,
+};
+
+struct bch_deleted {
+ struct bch_val v;
+};
+
+struct bch_whiteout {
+ struct bch_val v;
+};
+
+struct bch_error {
+ struct bch_val v;
+};
+
+struct bch_cookie {
+ struct bch_val v;
+ __le64 cookie;
+};
+
+struct bch_hash_whiteout {
+ struct bch_val v;
+};
+
+struct bch_set {
+ struct bch_val v;
+};
+
+/* Extents */
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32 - 0b1
+ * bch_extent_ptr - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
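The comment above describes a UTF-8-style scheme: an entry type is marked by which bit is set first, so decoding an entry is just "find the lowest set bit". A small stand-alone illustration of that encode/decode step, using the GCC/Clang __builtin_ctzll() and demo type names that are not tied to the real ptr/crc numbering:

#include <stdint.h>
#include <stdio.h>

/* Demo types only - not the real extent entry numbering */
enum demo_entry_type { DEMO_TYPE_A = 0, DEMO_TYPE_B = 1, DEMO_TYPE_C = 2 };

/* Encode: entry type n is marked by setting bit n in the entry's leading bits */
static uint64_t demo_encode(enum demo_entry_type t)
{
	return 1ULL << t;
}

/* Decode: the position of the lowest set bit recovers the type (word must be nonzero) */
static enum demo_entry_type demo_decode(uint64_t word)
{
	return (enum demo_entry_type) __builtin_ctzll(word);
}

int main(void)
{
	uint64_t w = demo_encode(DEMO_TYPE_C);		/* 0b100 */

	printf("word=%#llx type=%d\n", (unsigned long long) w, (int) demo_decode(w));
	return 0;
}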
+
+/* 128 bits, sufficient for cryptographic MACs: */
+struct bch_csum {
+ __le64 lo;
+ __le64 hi;
+} __packed __aligned(8);
+
+#define BCH_EXTENT_ENTRY_TYPES() \
+ x(ptr, 0) \
+ x(crc32, 1) \
+ x(crc64, 2) \
+ x(crc128, 3) \
+ x(stripe_ptr, 4) \
+ x(rebalance, 5)
+#define BCH_EXTENT_ENTRY_MAX 6
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed sizes are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:2,
+ _compressed_size:7,
+ _uncompressed_size:7,
+ offset:7,
+ _unused:1,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum;
+ __u32 compression_type:4,
+ csum_type:4,
+ _unused:1,
+ offset:7,
+ _uncompressed_size:7,
+ _compressed_size:7,
+ type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX (1U << 7)
+#define CRC32_NONCE_MAX 0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ _compressed_size:9,
+ _uncompressed_size:9,
+ offset:9,
+ nonce:10,
+ csum_type:4,
+ compression_type:4,
+ csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_hi:16,
+ compression_type:4,
+ csum_type:4,
+ nonce:10,
+ offset:9,
+ _uncompressed_size:9,
+ _compressed_size:9,
+ type:3;
+#endif
+ __u64 csum_lo;
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX (1U << 9)
+#define CRC64_NONCE_MAX ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:4,
+ _compressed_size:13,
+ _uncompressed_size:13,
+ offset:13,
+ nonce:13,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 compression_type:4,
+ csum_type:4,
+ nonce:13,
+ offset:13,
+ _uncompressed_size:13,
+ _compressed_size:13,
+ type:4;
+#endif
+ struct bch_csum csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX (1U << 13)
+#define CRC128_NONCE_MAX ((1U << 13) - 1)
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:1,
+ cached:1,
+ unused:1,
+ unwritten:1,
+ offset:44, /* 8 petabytes */
+ dev:8,
+ gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8,
+ dev:8,
+ offset:44,
+ unwritten:1,
+ unused:1,
+ cached:1,
+ type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:5,
+ block:8,
+ redundancy:4,
+ idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 idx:47,
+ redundancy:4,
+ block:8,
+ type:5;
+#endif
+};
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
+ target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 target:16,
+ compression:8,
+ unused:34,
+ type:6;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
+ unsigned long type;
+#elif __BITS_PER_LONG == 32
+ struct {
+ unsigned long pad;
+ unsigned long type;
+ };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f f;
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+ struct bch_val v;
+
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ __le16 flags;
+ struct bpos min_key;
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+ struct bch_val v;
+
+ __u64 _data[0];
+ union bch_extent_entry start[];
+} __packed __aligned(8);
+
+struct bch_reservation {
+ struct bch_val v;
+
+ __le32 generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+ ((sizeof(struct bch_extent_crc128) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+/* Inodes */
+
+#define BLOCKDEV_INODE_MAX 4096
+
+#define BCACHEFS_ROOT_INO 4096
+
+struct bch_inode {
+ struct bch_val v;
+
+ __le64 bi_hash_seed;
+ __le32 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+ struct bch_val v;
+
+ __le32 bi_generation;
+ __le32 pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_size, 64) \
+ x(bi_sectors, 64) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS() \
+ x(data_checksum, 8) \
+ x(compression, 8) \
+ x(project, 32) \
+ x(background_compression, 8) \
+ x(data_replicas, 8) \
+ x(promote_target, 16) \
+ x(foreground_target, 16) \
+ x(background_target, 16) \
+ x(erasure_code, 16) \
+ x(nocow, 8)
+
+enum inode_opt_id {
+#define x(name, ...) \
+ Inode_opt_##name,
+ BCH_INODE_OPTS()
+#undef x
+ Inode_opt_nr,
+};
+
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
+/* Dirents */
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie POSIX requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
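+
+/*
+ * Rough illustration of the lookup scheme above (informal; the hash helper
+ * name here is made up - the real helpers live elsewhere in the tree):
+ * looking up "foo" in directory inode 42 searches at approximately
+ *
+ *	POS(42, dirent_hash("foo"))
+ *
+ * and, on a hash collision, scans forward through successive offsets until it
+ * finds a dirent whose name matches, an empty slot, or a whiteout left by a
+ * deletion.
+ */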
+
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ union {
+ __le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ __u8 d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
+#define BCH_NAME_MAX 512
+
+/* Xattrs */
+
+#define KEY_TYPE_XATTR_INDEX_USER 0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __le16 x_val_len;
+ __u8 x_name[];
+} __packed __aligned(8);
+
+/* Bucket/allocation information: */
+
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1() \
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ __u64 fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
+
+struct bch_backpointer {
+ struct bch_val v;
+ __u8 btree_id;
+ __u8 level;
+ __u8 data_type;
+ __u64 bucket_offset:40;
+ __u32 bucket_len;
+ struct bpos pos;
+} __packed __aligned(8);
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+/* Quotas: */
+
+enum quota_types {
+ QTYP_USR = 0,
+ QTYP_GRP = 1,
+ QTYP_PRJ = 2,
+ QTYP_NR = 3,
+};
+
+enum quota_counters {
+ Q_SPC = 0,
+ Q_INO = 1,
+ Q_COUNTERS = 2,
+};
+
+struct bch_quota_counter {
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota {
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* Erasure coding */
+
+struct bch_stripe {
+ struct bch_val v;
+ __le16 sectors;
+ __u8 algorithm;
+ __u8 nr_blocks;
+ __u8 nr_redundant;
+
+ __u8 csum_granularity_bits;
+ __u8 csum_type;
+ __u8 pad;
+
+ struct bch_extent_ptr ptrs[];
+} __packed __aligned(8);
+
+/* Reflink: */
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __packed __aligned(8);
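+
+/*
+ * Rough worked example of the padding described above: if we took a reference
+ * on indirect extent sectors [100, 200) but the reflink pointer itself only
+ * covers [120, 180), then front_pad = 20 and back_pad = 20, so dropping the
+ * pointer can release the refcount on the full [100, 200) range even after
+ * the indirect extent has been split.
+ */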
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[];
+};
+
+/* Inline data */
+
+struct bch_inline_data {
+ struct bch_val v;
+ u8 data[];
+};
+
+/* Subvolumes: */
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ */
+ __le32 parent;
+ __le32 pad;
+ bch_le128 otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+/* Snapshots */
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+ __le32 tree;
+ __le32 depth;
+ __le32 skip[3];
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us a persistent identifier for each tree of
+ * bch_snapshot nodes, and allows us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+ struct bch_val v;
+ __le32 master_subvol;
+ __le32 root_snapshot;
+};
+
+/* LRU btree: */
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __packed __aligned(8);
+
+#define LRU_ID_STRIPES (1U << 16)
+
+/* Logged operations btree: */
+
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
+/* Optional/variable size superblock sections: */
+
+struct bch_sb_field {
+ __u64 _data[0];
+ __le32 u64s;
+ __le32 type;
+};
+
+#define BCH_SB_FIELDS() \
+ x(journal, 0) \
+ x(members_v1, 1) \
+ x(crypt, 2) \
+ x(replicas_v0, 3) \
+ x(quota, 4) \
+ x(disk_groups, 5) \
+ x(clean, 6) \
+ x(replicas, 7) \
+ x(journal_seq_blacklist, 8) \
+ x(journal_v2, 9) \
+ x(counters, 10) \
+ x(members_v2, 11) \
+ x(errors, 12) \
+ x(ext, 13) \
+ x(downgrade, 14)
+
+enum bch_sb_field_type {
+#define x(f, nr) BCH_SB_FIELD_##f = nr,
+ BCH_SB_FIELDS()
+#undef x
+ BCH_SB_FIELD_NR
+};
+
+/*
+ * Most superblock fields are replicated in all devices' superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS \
+ ((1U << BCH_SB_FIELD_journal)| \
+ (1U << BCH_SB_FIELD_journal_v2))
+
+/* BCH_SB_FIELD_journal: */
+
+struct bch_sb_field_journal {
+ struct bch_sb_field field;
+ __le64 buckets[];
+};
+
+struct bch_sb_field_journal_v2 {
+ struct bch_sb_field field;
+
+ struct bch_sb_field_journal_v2_entry {
+ __le64 start;
+ __le64 nr;
+ } d[];
+};
+
+/* BCH_SB_FIELD_members_v1: */
+
+#define BCH_MIN_NR_NBUCKETS (1 << 6)
+
+#define BCH_IOPS_MEASUREMENTS() \
+ x(seqread, 0) \
+ x(seqwrite, 1) \
+ x(randread, 2) \
+ x(randwrite, 3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+ BCH_IOPS_MEASUREMENTS()
+#undef x
+ BCH_IOPS_NR
+};
+
+#define BCH_MEMBER_ERROR_TYPES() \
+ x(read, 0) \
+ x(write, 1) \
+ x(checksum, 2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+ BCH_MEMBER_ERROR_TYPES()
+#undef x
+ BCH_MEMBER_ERROR_NR
+};
+
+struct bch_member {
+ __uuid_t uuid;
+ __le64 nbuckets; /* device size */
+ __le16 first_bucket; /* index of first bucket used */
+ __le16 bucket_size; /* sectors */
+ __le32 pad;
+ __le64 last_mount; /* time_t */
+
+ __le64 flags;
+ __le32 iops[4];
+ __le64 errors[BCH_MEMBER_ERROR_NR];
+ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
+ __le64 errors_reset_time;
+};
+
+#define BCH_MEMBER_V1_BYTES 56
+
+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags, 30, 31)
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
+
+#define BCH_MEMBER_STATES() \
+ x(rw, 0) \
+ x(ro, 1) \
+ x(failed, 2) \
+ x(spare, 3)
+
+enum bch_member_state {
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+ BCH_MEMBER_STATES()
+#undef x
+ BCH_MEMBER_STATE_NR
+};
+
+struct bch_sb_field_members_v1 {
+ struct bch_sb_field field;
+ struct bch_member _members[]; //Members are now variable size
+};
+
+struct bch_sb_field_members_v2 {
+ struct bch_sb_field field;
+ __le16 member_bytes; //size of single member entry
+ u8 pad[6];
+ struct bch_member _members[];
+};
+
+/* BCH_SB_FIELD_crypt: */
+
+struct nonce {
+ __le32 d[4];
+};
+
+struct bch_key {
+ __le64 key[4];
+};
+
+#define BCH_KEY_MAGIC \
+ (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \
+ ((__u64) 'h' << 16)|((__u64) '*' << 24)| \
+ ((__u64) '*' << 32)|((__u64) 'k' << 40)| \
+ ((__u64) 'e' << 48)|((__u64) 'y' << 56))
+
+struct bch_encrypted_key {
+ __le64 magic;
+ struct bch_key key;
+};
+
+/*
+ * If this field is present in the superblock, it stores an encryption key which
+ * is used to encrypt all other data/metadata. The key will normally be encrypted
+ * with the key userspace provides, but if encryption has been turned off we'll
+ * just store the master key unencrypted in the superblock so we can access the
+ * previously encrypted data.
+ */
+struct bch_sb_field_crypt {
+ struct bch_sb_field field;
+
+ __le64 flags;
+ __le64 kdf_flags;
+ struct bch_encrypted_key key;
+};
+
+LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
+
+enum bch_kdf_types {
+ BCH_KDF_SCRYPT = 0,
+ BCH_KDF_NR = 1,
+};
+
+/* stored as base 2 log of scrypt params: */
+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
+
+/* BCH_SB_FIELD_replicas: */
+
+#define BCH_DATA_TYPES() \
+ x(free, 0) \
+ x(sb, 1) \
+ x(journal, 2) \
+ x(btree, 3) \
+ x(user, 4) \
+ x(cached, 5) \
+ x(parity, 6) \
+ x(stripe, 7) \
+ x(need_gc_gens, 8) \
+ x(need_discard, 9)
+
+enum bch_data_type {
+#define x(t, n) BCH_DATA_##t,
+ BCH_DATA_TYPES()
+#undef x
+ BCH_DATA_NR
+};
+
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct bch_replicas_entry_v0 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 devs[];
+} __packed;
+
+struct bch_sb_field_replicas_v0 {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v0 entries[];
+} __packed __aligned(8);
+
+struct bch_replicas_entry {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 nr_required;
+ __u8 devs[];
+} __packed;
+
+#define replicas_entry_bytes(_i) \
+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
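+
+/*
+ * e.g. an entry with nr_devs == 3 occupies offsetof(..., devs) + 3 ==
+ * 3 + 3 == 6 bytes, since struct bch_replicas_entry is __packed.
+ */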
+
+struct bch_sb_field_replicas {
+ struct bch_sb_field field;
+ struct bch_replicas_entry entries[];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type {
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_disk_groups: */
+
+#define BCH_SB_LABEL_SIZE 32
+
+struct bch_disk_group {
+ __u8 label[BCH_SB_LABEL_SIZE];
+ __le64 flags[2];
+} __packed __aligned(8);
+
+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+ struct bch_sb_field field;
+ struct bch_disk_group entries[];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_counters */
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_start_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[];
+};
+
+/*
+ * On clean shutdown, store btree roots and current journal sequence number in
+ * the superblock:
+ */
+struct jset_entry {
+ __le16 u64s;
+ __u8 btree_id;
+ __u8 level;
+ __u8 type; /* designates what this jset holds */
+ __u8 pad[3];
+
+ struct bkey_i start[0];
+ __u64 _data[];
+};
+
+struct bch_sb_field_clean {
+ struct bch_sb_field field;
+
+ __le32 flags;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
+ __le64 journal_seq;
+
+ struct jset_entry start[0];
+ __u64 _data[];
+};
+
+struct journal_seq_blacklist_entry {
+ __le64 start;
+ __le64 end;
+};
+
+struct bch_sb_field_journal_seq_blacklist {
+ struct bch_sb_field field;
+ struct journal_seq_blacklist_entry start[];
+};
+
+struct bch_sb_field_errors {
+ struct bch_sb_field field;
+ struct bch_sb_field_error_entry {
+ __le64 v;
+ __le64 last_error_time;
+ } entries[];
+};
+
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
+
+struct bch_sb_field_ext {
+ struct bch_sb_field field;
+ __le64 recovery_passes_required[2];
+ __le64 errors_silent[8];
+};
+
+struct bch_sb_field_downgrade_entry {
+ __le16 version;
+ __le64 recovery_passes[2];
+ __le16 nr_errors;
+ __le16 errors[] __counted_by(nr_errors);
+} __packed __aligned(2);
+
+struct bch_sb_field_downgrade {
+ struct bch_sb_field field;
+ struct bch_sb_field_downgrade_entry entries[];
+};
+
+/* Superblock: */
+
+/*
+ * New versioning scheme:
+ * One common version number for all on disk data structures - superblock, btree
+ * nodes, journal entries
+ */
+#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10))
+#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
+#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
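+
+/*
+ * e.g. BCH_VERSION(1, 3) == (1 << 10)|3 == 1027, and conversely
+ * BCH_VERSION_MAJOR(1027) == 1, BCH_VERSION_MINOR(1027) == 3.
+ */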
+
+#define RECOVERY_PASS_ALL_FSCK (1ULL << 63)
+
+/*
+ * field 1: version name
+ * field 2: BCH_VERSION(major, minor)
+ * field 3: recovery passes required on upgrade
+ */
+#define BCH_METADATA_VERSIONS() \
+ x(bkey_renumber, BCH_VERSION(0, 10), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_btree_change, BCH_VERSION(0, 11), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot, BCH_VERSION(0, 12), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_backpointers, BCH_VERSION(0, 13), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_2, BCH_VERSION(0, 15), \
+ BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \
+ BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(reflink_p_fix, BCH_VERSION(0, 16), \
+ BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \
+ x(subvol_dirent, BCH_VERSION(0, 17), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_v2, BCH_VERSION(0, 18), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(freespace, BCH_VERSION(0, 19), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(alloc_v4, BCH_VERSION(0, 20), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(new_data_types, BCH_VERSION(0, 21), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(backpointers, BCH_VERSION(0, 22), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_v3, BCH_VERSION(0, 23), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(unwritten_extents, BCH_VERSION(0, 24), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(bucket_gens, BCH_VERSION(0, 25), \
+ BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(lru_v2, BCH_VERSION(0, 26), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(fragmentation_lru, BCH_VERSION(0, 27), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_trees, BCH_VERSION(0, 29), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(major_minor, BCH_VERSION(1, 0), \
+ 0) \
+ x(snapshot_skiplists, BCH_VERSION(1, 1), \
+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
+ x(deleted_inodes, BCH_VERSION(1, 2), \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
+ x(rebalance_work, BCH_VERSION(1, 3), \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+
+enum bcachefs_metadata_version {
+ bcachefs_metadata_version_min = 9,
+#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n,
+ BCH_METADATA_VERSIONS()
+#undef x
+ bcachefs_metadata_version_max
+};
+
+static const __maybe_unused
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
+
+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
+
+#define BCH_SB_SECTOR 8
+#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
+
+struct bch_sb_layout {
+ __uuid_t magic; /* bcachefs superblock UUID */
+ __u8 layout_type;
+ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
+ __u8 nr_superblocks;
+ __u8 pad[5];
+ __le64 sb_offset[61];
+} __packed __aligned(8);
+
+#define BCH_SB_LAYOUT_SECTOR 7
+
+/*
+ * @offset - sector where this sb was written
+ * @version - on disk format version
+ * @version_min - Oldest metadata version this filesystem contains; so we can
+ * safely drop compatibility code and refuse to mount filesystems
+ * we'd need it for
+ * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
+ * @uuid - used for generating various magic numbers and identifying
+ * member devices, never changes
+ * @user_uuid - user visible UUID, may be changed
+ * @label - filesystem label
+ * @seq - identifies most recent superblock, incremented each time
+ * superblock is written
+ * @features - enabled incompatible features
+ */
+struct bch_sb {
+ struct bch_csum csum;
+ __le16 version;
+ __le16 version_min;
+ __le16 pad[2];
+ __uuid_t magic;
+ __uuid_t uuid;
+ __uuid_t user_uuid;
+ __u8 label[BCH_SB_LABEL_SIZE];
+ __le64 offset;
+ __le64 seq;
+
+ __le16 block_size;
+ __u8 dev_idx;
+ __u8 nr_devices;
+ __le32 u64s;
+
+ __le64 time_base_lo;
+ __le32 time_base_hi;
+ __le32 time_precision;
+
+ __le64 flags[8];
+ __le64 features[2];
+ __le64 compat[2];
+
+ struct bch_sb_layout layout;
+
+ struct bch_sb_field start[0];
+ __le64 _data[];
+} __packed __aligned(8);
+
+/*
+ * Flags:
+ * BCH_SB_INITIALIZED - set on first mount
+ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect
+ * behaviour of mount/recovery path:
+ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits
+ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80
+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
+ * DATA/META_CSUM_TYPE. Also indicates encryption
+ * algorithm in use, if/when we get more than one
+ */
+
+LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16);
+
+LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1);
+LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
+LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8);
+LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12);
+
+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28);
+
+LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33);
+LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40);
+
+LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44);
+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
+
+LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57);
+LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58);
+LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59);
+LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60);
+
+LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
+LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
+
+LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
+
+LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
+LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
+
+LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
+
+/*
+ * Max size of an extent that may require bouncing to read or write
+ * (checksummed, compressed): 64k
+ */
+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
+ struct bch_sb, flags[1], 14, 20);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
+
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
+
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
+ struct bch_sb, flags[2], 0, 4);
+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
+
+LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
+LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
+LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
+LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
+LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
+LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56);
+
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
+ struct bch_sb, flags[4], 60, 64);
+
+LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
+ struct bch_sb, flags[5], 0, 16);
+
+static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
+{
+ return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
+}
+
+static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
+{
+ SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
+ SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
+}
+
+static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
+{
+ return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
+ (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
+}
+
+static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
+{
+ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
+ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
+}
+
+/*
+ * Features:
+ *
+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
+ * reflink: gates KEY_TYPE_reflink
+ * inline_data: gates KEY_TYPE_inline_data
+ * new_siphash: gates BCH_STR_HASH_siphash
+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
+ */
+#define BCH_SB_FEATURES() \
+ x(lz4, 0) \
+ x(gzip, 1) \
+ x(zstd, 2) \
+ x(atomic_nlink, 3) \
+ x(ec, 4) \
+ x(journal_seq_blacklist_v3, 5) \
+ x(reflink, 6) \
+ x(new_siphash, 7) \
+ x(inline_data, 8) \
+ x(new_extent_overwrite, 9) \
+ x(incompressible, 10) \
+ x(btree_ptr_v2, 11) \
+ x(extents_above_btree_updates, 12) \
+ x(btree_updates_journalled, 13) \
+ x(reflink_inline_data, 14) \
+ x(new_varint, 15) \
+ x(journal_no_flush, 16) \
+ x(alloc_v2, 17) \
+ x(extents_across_btree_nodes, 18)
+
+#define BCH_SB_FEATURES_ALWAYS \
+ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \
+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+ (1ULL << BCH_FEATURE_btree_updates_journalled)|\
+ (1ULL << BCH_FEATURE_alloc_v2)|\
+ (1ULL << BCH_FEATURE_extents_across_btree_nodes))
+
+#define BCH_SB_FEATURES_ALL \
+ (BCH_SB_FEATURES_ALWAYS| \
+ (1ULL << BCH_FEATURE_new_siphash)| \
+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \
+ (1ULL << BCH_FEATURE_new_varint)| \
+ (1ULL << BCH_FEATURE_journal_no_flush))
+
+enum bch_sb_feature {
+#define x(f, n) BCH_FEATURE_##f,
+ BCH_SB_FEATURES()
+#undef x
+ BCH_FEATURE_NR,
+};
+
+#define BCH_SB_COMPAT() \
+ x(alloc_info, 0) \
+ x(alloc_metadata, 1) \
+ x(extents_above_btree_updates_done, 2) \
+ x(bformat_overflow_done, 3)
+
+enum bch_sb_compat {
+#define x(f, n) BCH_COMPAT_##f,
+ BCH_SB_COMPAT()
+#undef x
+ BCH_COMPAT_NR,
+};
+
+/* options: */
+
+#define BCH_VERSION_UPGRADE_OPTS() \
+ x(compatible, 0) \
+ x(incompatible, 1) \
+ x(none, 2)
+
+enum bch_version_upgrade_opts {
+#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
+ BCH_VERSION_UPGRADE_OPTS()
+#undef x
+};
+
+#define BCH_REPLICAS_MAX 4U
+
+#define BCH_BKEY_PTRS_MAX 16U
+
+#define BCH_ERROR_ACTIONS() \
+ x(continue, 0) \
+ x(ro, 1) \
+ x(panic, 2)
+
+enum bch_error_actions {
+#define x(t, n) BCH_ON_ERROR_##t = n,
+ BCH_ERROR_ACTIONS()
+#undef x
+ BCH_ON_ERROR_NR
+};
+
+#define BCH_STR_HASH_TYPES() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash_old, 2) \
+ x(siphash, 3)
+
+enum bch_str_hash_type {
+#define x(t, n) BCH_STR_HASH_##t = n,
+ BCH_STR_HASH_TYPES()
+#undef x
+ BCH_STR_HASH_NR
+};
+
+#define BCH_STR_HASH_OPTS() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash, 2)
+
+enum bch_str_hash_opts {
+#define x(t, n) BCH_STR_HASH_OPT_##t = n,
+ BCH_STR_HASH_OPTS()
+#undef x
+ BCH_STR_HASH_OPT_NR
+};
+
+#define BCH_CSUM_TYPES() \
+ x(none, 0) \
+ x(crc32c_nonzero, 1) \
+ x(crc64_nonzero, 2) \
+ x(chacha20_poly1305_80, 3) \
+ x(chacha20_poly1305_128, 4) \
+ x(crc32c, 5) \
+ x(crc64, 6) \
+ x(xxhash, 7)
+
+enum bch_csum_type {
+#define x(t, n) BCH_CSUM_##t = n,
+ BCH_CSUM_TYPES()
+#undef x
+ BCH_CSUM_NR
+};
+
+static const __maybe_unused unsigned bch_crc_bytes[] = {
+ [BCH_CSUM_none] = 0,
+ [BCH_CSUM_crc32c_nonzero] = 4,
+ [BCH_CSUM_crc32c] = 4,
+ [BCH_CSUM_crc64_nonzero] = 8,
+ [BCH_CSUM_crc64] = 8,
+ [BCH_CSUM_xxhash] = 8,
+ [BCH_CSUM_chacha20_poly1305_80] = 10,
+ [BCH_CSUM_chacha20_poly1305_128] = 16,
+};
+
+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
+{
+ switch (type) {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#define BCH_CSUM_OPTS() \
+ x(none, 0) \
+ x(crc32c, 1) \
+ x(crc64, 2) \
+ x(xxhash, 3)
+
+enum bch_csum_opts {
+#define x(t, n) BCH_CSUM_OPT_##t = n,
+ BCH_CSUM_OPTS()
+#undef x
+ BCH_CSUM_OPT_NR
+};
+
+#define BCH_COMPRESSION_TYPES() \
+ x(none, 0) \
+ x(lz4_old, 1) \
+ x(gzip, 2) \
+ x(lz4, 3) \
+ x(zstd, 4) \
+ x(incompressible, 5)
+
+enum bch_compression_type {
+#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
+ BCH_COMPRESSION_TYPES()
+#undef x
+ BCH_COMPRESSION_TYPE_NR
+};
+
+#define BCH_COMPRESSION_OPTS() \
+ x(none, 0) \
+ x(lz4, 1) \
+ x(gzip, 2) \
+ x(zstd, 3)
+
+enum bch_compression_opts {
+#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
+ BCH_COMPRESSION_OPTS()
+#undef x
+ BCH_COMPRESSION_OPT_NR
+};
+
+/*
+ * Magic numbers
+ *
+ * The various other data structures have their own magic numbers, which are
+ * xored with the first part of the cache set's UUID
+ */
+
+#define BCACHE_MAGIC \
+ UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \
+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+#define BCHFS_MAGIC \
+ UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
+
+#define BCACHEFS_STATFS_MAGIC 0xca451a4e
+
+#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
+#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
+
+static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
+{
+ __le64 ret;
+
+ memcpy(&ret, &sb->uuid, sizeof(ret));
+ return ret;
+}
+
+static inline __u64 __jset_magic(struct bch_sb *sb)
+{
+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
+}
+
+static inline __u64 __bset_magic(struct bch_sb *sb)
+{
+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
+}
+
+/* Journal */
+
+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
+
+#define BCH_JSET_ENTRY_TYPES() \
+ x(btree_keys, 0) \
+ x(btree_root, 1) \
+ x(prio_ptrs, 2) \
+ x(blacklist, 3) \
+ x(blacklist_v2, 4) \
+ x(usage, 5) \
+ x(data_usage, 6) \
+ x(clock, 7) \
+ x(dev_usage, 8) \
+ x(log, 9) \
+ x(overwrite, 10)
+
+enum {
+#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
+ BCH_JSET_ENTRY_TYPES()
+#undef x
+ BCH_JSET_ENTRY_NR
+};
+
+/*
+ * Journal sequence numbers can be blacklisted: bsets record the max sequence
+ * number of all the journal entries they contain updates for, so that on
+ * recovery we can ignore those bsets that contain index updates newer than what
+ * made it into the journal.
+ *
+ * This means that we can't reuse that journal_seq - we have to skip it, and
+ * then record that we skipped it so that the next time we crash and recover we
+ * don't think there was a missing journal entry.
+ */
+struct jset_entry_blacklist {
+ struct jset_entry entry;
+ __le64 seq;
+};
+
+struct jset_entry_blacklist_v2 {
+ struct jset_entry entry;
+ __le64 start;
+ __le64 end;
+};
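+
+/*
+ * Worked example of the above: if a bset recorded journal_seq 100 but journal
+ * entry 100 itself never reached the disk before a crash, recovery must not
+ * reuse seq 100 for a new journal entry; instead it skips ahead and records a
+ * blacklist entry covering 100, so later recoveries know the gap was
+ * intentional and ignore any bsets claiming updates from that seq.
+ */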
+
+#define BCH_FS_USAGE_TYPES() \
+ x(reserved, 0) \
+ x(inodes, 1) \
+ x(key_version, 2)
+
+enum {
+#define x(f, nr) BCH_FS_USAGE_##f = nr,
+ BCH_FS_USAGE_TYPES()
+#undef x
+ BCH_FS_USAGE_NR
+};
+
+struct jset_entry_usage {
+ struct jset_entry entry;
+ __le64 v;
+} __packed;
+
+struct jset_entry_data_usage {
+ struct jset_entry entry;
+ __le64 v;
+ struct bch_replicas_entry r;
+} __packed;
+
+struct jset_entry_clock {
+ struct jset_entry entry;
+ __u8 rw;
+ __u8 pad[7];
+ __le64 time;
+} __packed;
+
+struct jset_entry_dev_usage_type {
+ __le64 buckets;
+ __le64 sectors;
+ __le64 fragmented;
+} __packed;
+
+struct jset_entry_dev_usage {
+ struct jset_entry entry;
+ __le32 dev;
+ __u32 pad;
+
+ __le64 buckets_ec;
+ __le64 _buckets_unavailable; /* No longer used */
+
+ struct jset_entry_dev_usage_type d[];
+};
+
+static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
+{
+ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+}
+
+struct jset_entry_log {
+ struct jset_entry entry;
+ u8 d[];
+} __packed;
+
+/*
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+struct jset {
+ struct bch_csum csum;
+
+ __le64 magic;
+ __le64 seq;
+ __le32 version;
+ __le32 flags;
+
+ __le32 u64s; /* size of d[] in u64s */
+
+ __u8 encrypted_start[0];
+
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
+
+ /* Sequence number of oldest dirty journal entry */
+ __le64 last_seq;
+
+
+ struct jset_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
+LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
+
+#define BCH_JOURNAL_BUCKETS_MIN 8
+
+/* Btree: */
+
+enum btree_id_flags {
+ BTREE_ID_EXTENTS = BIT(0),
+ BTREE_ID_SNAPSHOTS = BIT(1),
+ BTREE_ID_SNAPSHOT_FIELD = BIT(2),
+ BTREE_ID_DATA = BIT(3),
+};
+
+#define BCH_BTREE_IDS() \
+ x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_error)| \
+ BIT_ULL(KEY_TYPE_cookie)| \
+ BIT_ULL(KEY_TYPE_extent)| \
+ BIT_ULL(KEY_TYPE_reservation)| \
+ BIT_ULL(KEY_TYPE_reflink_p)| \
+ BIT_ULL(KEY_TYPE_inline_data)) \
+ x(inodes, 1, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_inode)| \
+ BIT_ULL(KEY_TYPE_inode_v2)| \
+ BIT_ULL(KEY_TYPE_inode_v3)| \
+ BIT_ULL(KEY_TYPE_inode_generation)) \
+ x(dirents, 2, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_hash_whiteout)| \
+ BIT_ULL(KEY_TYPE_dirent)) \
+ x(xattrs, 3, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_cookie)| \
+ BIT_ULL(KEY_TYPE_hash_whiteout)| \
+ BIT_ULL(KEY_TYPE_xattr)) \
+ x(alloc, 4, 0, \
+ BIT_ULL(KEY_TYPE_alloc)| \
+ BIT_ULL(KEY_TYPE_alloc_v2)| \
+ BIT_ULL(KEY_TYPE_alloc_v3)| \
+ BIT_ULL(KEY_TYPE_alloc_v4)) \
+ x(quotas, 5, 0, \
+ BIT_ULL(KEY_TYPE_quota)) \
+ x(stripes, 6, 0, \
+ BIT_ULL(KEY_TYPE_stripe)) \
+ x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
+ BIT_ULL(KEY_TYPE_reflink_v)| \
+ BIT_ULL(KEY_TYPE_indirect_inline_data)) \
+ x(subvolumes, 8, 0, \
+ BIT_ULL(KEY_TYPE_subvolume)) \
+ x(snapshots, 9, 0, \
+ BIT_ULL(KEY_TYPE_snapshot)) \
+ x(lru, 10, 0, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(freespace, 11, BTREE_ID_EXTENTS, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(need_discard, 12, 0, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(backpointers, 13, 0, \
+ BIT_ULL(KEY_TYPE_backpointer)) \
+ x(bucket_gens, 14, 0, \
+ BIT_ULL(KEY_TYPE_bucket_gens)) \
+ x(snapshot_trees, 15, 0, \
+ BIT_ULL(KEY_TYPE_snapshot_tree)) \
+ x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(logged_ops, 17, 0, \
+ BIT_ULL(KEY_TYPE_logged_op_truncate)| \
+ BIT_ULL(KEY_TYPE_logged_op_finsert)) \
+ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
+
+enum btree_id {
+#define x(name, nr, ...) BTREE_ID_##name = nr,
+ BCH_BTREE_IDS()
+#undef x
+ BTREE_ID_NR
+};
+
+#define BTREE_MAX_DEPTH 4U
+
+/* Btree nodes */
+
+/*
+ * Btree nodes
+ *
+ * On disk a btree node is a list/log of these; within each set the keys are
+ * sorted
+ */
+struct bset {
+ __le64 seq;
+
+ /*
+ * Highest journal entry this bset contains keys for.
+ * If on recovery we don't see that journal entry, this bset is ignored:
+ * this allows us to preserve the order of all index updates after a
+ * crash, since the journal records a total order of all index updates
+ * and anything that didn't make it to the journal doesn't get used.
+ */
+ __le64 journal_seq;
+
+ __le32 flags;
+ __le16 version;
+ __le16 u64s; /* count of d[] in u64s */
+
+ struct bkey_packed start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
+
+LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5);
+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
+ struct bset, flags, 5, 6);
+
+/* Sector offset within the btree node: */
+LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32);
+
+struct btree_node {
+ struct bch_csum csum;
+ __le64 magic;
+
+ /* this flags field is encrypted, unlike bset->flags: */
+ __le64 flags;
+
+ /* Closed interval: */
+ struct bpos min_key;
+ struct bpos max_key;
+ struct bch_extent_ptr _ptr; /* not used anymore */
+ struct bkey_format format;
+
+ union {
+ struct bset keys;
+ struct {
+ __u8 pad[22];
+ __le16 u64s;
+ __u64 _data[0];
+
+ };
+ };
+} __packed __aligned(8);
+
+LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4);
+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
+ struct btree_node, flags, 8, 9);
+LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25);
+/* 25-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
+
+static inline __u64 BTREE_NODE_ID(struct btree_node *n)
+{
+ return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
+}
+
+static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
+{
+ SET_BTREE_NODE_ID_LO(n, v);
+ SET_BTREE_NODE_ID_HI(n, v >> 4);
+}
+
+struct btree_node_entry {
+ struct bch_csum csum;
+
+ union {
+ struct bset keys;
+ struct {
+ __u8 pad[22];
+ __le16 u64s;
+ __u64 _data[0];
+ };
+ };
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
new file mode 100644
index 000000000000..f05881f7e113
--- /dev/null
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IOCTL_H
+#define _BCACHEFS_IOCTL_H
+
+#include <linux/uuid.h>
+#include <asm/ioctl.h>
+#include "bcachefs_format.h"
+
+/*
+ * Flags common to multiple ioctls:
+ */
+#define BCH_FORCE_IF_DATA_LOST (1 << 0)
+#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
+#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
+#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
+
+#define BCH_FORCE_IF_LOST \
+ (BCH_FORCE_IF_DATA_LOST| \
+ BCH_FORCE_IF_METADATA_LOST)
+#define BCH_FORCE_IF_DEGRADED \
+ (BCH_FORCE_IF_DATA_DEGRADED| \
+ BCH_FORCE_IF_METADATA_DEGRADED)
+
+/*
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
+#define BCH_BY_INDEX (1 << 4)
+
+/*
+ * For BCH_IOCTL_READ_SUPER: get the superblock of a specific device, not the
+ * filesystem-wide superblock:
+ */
+#define BCH_READ_DEV (1 << 5)
+
+/* global control dev: */
+
+/* These are currently broken, and probably unnecessary: */
+#if 0
+#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
+#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
+
+struct bch_ioctl_assemble {
+ __u32 flags;
+ __u32 nr_devs;
+ __u64 pad;
+ __u64 devs[];
+};
+
+struct bch_ioctl_incremental {
+ __u32 flags;
+ __u64 pad;
+ __u64 dev;
+};
+#endif
+
+/* filesystem ioctls: */
+
+#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
+
+/* These only make sense when we also have incremental assembly */
+#if 0
+#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
+#define BCH_IOCTL_STOP _IO(0xbc, 3)
+#endif
+
+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal)
+
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+
+/* ioctls below act on a particular file, not the filesystem as a whole: */
+
+#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
+
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
+struct bch_ioctl_query_uuid {
+ __uuid_t uuid;
+};
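+
+/*
+ * Minimal userspace sketch (informal; error handling omitted, and obtaining
+ * a file descriptor for the filesystem is left to the caller):
+ *
+ *	struct bch_ioctl_query_uuid u;
+ *
+ *	if (ioctl(fs_fd, BCH_IOCTL_QUERY_UUID, &u) == 0)
+ *		... u.uuid now holds the user visible filesystem UUID ...
+ */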
+
+#if 0
+struct bch_ioctl_start {
+ __u32 flags;
+ __u32 pad;
+};
+#endif
+
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either online or offline.
+ *
+ * Will fail if removing @dev would leave us with insufficient read/write
+ * devices or degraded/unavailable data, unless the appropriate
+ * BCH_FORCE_IF_* flags are set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * All existing data on @dev will be available once the device is online,
+ * exactly as if @dev had been present when the filesystem was first mounted.
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read/write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+
+struct bch_ioctl_disk {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state - one of the bch_member_state states (rw, ro, failed,
+ * spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+struct bch_ioctl_disk_set_state {
+ __u32 flags;
+ __u8 new_state;
+ __u8 pad[3];
+ __u64 dev;
+};
+
+enum bch_data_ops {
+ BCH_DATA_OP_SCRUB = 0,
+ BCH_DATA_OP_REREPLICATE = 1,
+ BCH_DATA_OP_MIGRATE = 2,
+ BCH_DATA_OP_REWRITE_OLD_NODES = 3,
+ BCH_DATA_OP_NR = 4,
+};
+
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
+ */
+struct bch_ioctl_data {
+ __u16 op;
+ __u8 start_btree;
+ __u8 end_btree;
+ __u32 flags;
+
+ struct bpos start_pos;
+ struct bpos end_pos;
+
+ union {
+ struct {
+ __u32 dev;
+ __u32 pad;
+ } migrate;
+ struct {
+ __u64 pad[8];
+ };
+ };
+} __packed __aligned(8);
+
+enum bch_data_event {
+ BCH_DATA_EVENT_PROGRESS = 0,
+ /* XXX: add an event for reporting errors */
+ BCH_DATA_EVENT_NR = 1,
+};
+
+struct bch_ioctl_data_progress {
+ __u8 data_type;
+ __u8 btree_id;
+ __u8 pad[2];
+ struct bpos pos;
+
+ __u64 sectors_done;
+ __u64 sectors_total;
+} __packed __aligned(8);
+
+struct bch_ioctl_data_event {
+ __u8 type;
+ __u8 pad[7];
+ union {
+ struct bch_ioctl_data_progress p;
+ __u64 pad2[15];
+ };
+} __packed __aligned(8);
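+
+/*
+ * Rough usage sketch for BCH_IOCTL_DATA (informal; error handling omitted):
+ *
+ *	struct bch_ioctl_data arg = {
+ *		.op		= BCH_DATA_OP_REREPLICATE,
+ *		.end_btree	= BTREE_ID_NR,
+ *		.start_pos	= POS_MIN,
+ *		.end_pos	= POS_MAX,
+ *	};
+ *	int progress_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &arg);
+ *	struct bch_ioctl_data_event e;
+ *
+ *	while (read(progress_fd, &e, sizeof(e)) == sizeof(e))
+ *		... report e.p.sectors_done out of e.p.sectors_total ...
+ *	close(progress_fd);
+ *
+ * (closing the returned fd stops the background job, per the comment above)
+ */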
+
+struct bch_replicas_usage {
+ __u64 sectors;
+ struct bch_replicas_entry r;
+} __packed;
+
+static inline struct bch_replicas_usage *
+replicas_usage_next(struct bch_replicas_usage *u)
+{
+ return (void *) u + replicas_entry_bytes(&u->r) + 8;
+}
+
+/*
+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
+struct bch_ioctl_fs_usage {
+ __u64 capacity;
+ __u64 used;
+ __u64 online_reserved;
+ __u64 persistent_reserved[BCH_REPLICAS_MAX];
+
+ __u32 replica_entries_bytes;
+ __u32 pad;
+
+ struct bch_replicas_usage replicas[0];
+};
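+
+/*
+ * Rough usage sketch (informal; error handling omitted): size the buffer for
+ * the variable length replicas[] tail and walk it with replicas_usage_next():
+ *
+ *	unsigned bytes = 4096;
+ *	struct bch_ioctl_fs_usage *u = calloc(1, sizeof(*u) + bytes);
+ *
+ *	u->replica_entries_bytes = bytes;
+ *	if (!ioctl(fs_fd, BCH_IOCTL_FS_USAGE, u))
+ *		for (struct bch_replicas_usage *r = u->replicas;
+ *		     (void *) r < (void *) u->replicas + u->replica_entries_bytes;
+ *		     r = replicas_usage_next(r))
+ *			... r->sectors of r->r.data_type data ...
+ *
+ * (retry with a larger buffer if the ioctl fails with ERANGE)
+ */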
+
+/*
+ * BCH_IOCTL_DEV_USAGE: query device disk space usage
+ *
+ * Returns disk space usage broken out by data type - both by buckets and
+ * sectors.
+ */
+struct bch_ioctl_dev_usage {
+ __u64 dev;
+ __u32 flags;
+ __u8 state;
+ __u8 pad[7];
+
+ __u32 bucket_size;
+ __u64 nr_buckets;
+
+ __u64 buckets_ec;
+
+ struct bch_ioctl_dev_usage_type {
+ __u64 buckets;
+ __u64 sectors;
+ __u64 fragmented;
+ } d[BCH_DATA_NR];
+};
+
+/*
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
+ *
+ * Equivalent to reading the superblock directly from the block device, except
+ * avoids racing with the kernel writing the superblock or having to figure out
+ * which block device to read
+ *
+ * @sb - buffer to read into
+ * @size - size of userspace allocated buffer
+ * @dev - device to read superblock for, if BCH_READ_DEV flag is
+ * specified
+ *
+ * Returns -ERANGE if buffer provided is too small
+ */
+struct bch_ioctl_read_super {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 size;
+ __u64 sb;
+};
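+
+/*
+ * Rough usage sketch (informal): @sb is a userspace buffer passed as an
+ * integer:
+ *
+ *	struct bch_ioctl_read_super s = {
+ *		.size	= buf_size,
+ *		.sb	= (__u64) (unsigned long) buf,
+ *	};
+ *
+ *	ioctl(fs_fd, BCH_IOCTL_READ_SUPER, &s);
+ */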
+
+/*
+ * BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query the
+ * filesystem to determine whether the disk is an (online) member - if so,
+ * returns the device's index
+ *
+ * Returns -ENOENT if not found
+ */
+struct bch_ioctl_disk_get_idx {
+ __u64 dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
+ *
+ * @dev - member to resize
+ * @nbuckets - new number of buckets
+ */
+struct bch_ioctl_disk_resize {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+};
+
+/*
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
+ *
+ * @dev - member to resize
+ * @nbuckets - new number of buckets
+ */
+struct bch_ioctl_disk_resize_journal {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+};
+
+struct bch_ioctl_subvolume {
+ __u32 flags;
+ __u32 dirfd;
+ __u16 mode;
+ __u16 pad[3];
+ __u64 dst_ptr;
+ __u64 src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+
+#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
new file mode 100644
index 000000000000..abdb05507d16
--- /dev/null
+++ b/fs/bcachefs/bkey.c
@@ -0,0 +1,1120 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "bkey_cmp.h"
+#include "bkey_methods.h"
+#include "bset.h"
+#include "util.h"
+
+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
+
+void bch2_bkey_packed_to_binary_text(struct printbuf *out,
+ const struct bkey_format *f,
+ const struct bkey_packed *k)
+{
+ const u64 *p = high_word(f, k);
+ unsigned word_bits = 64 - high_bit_offset;
+ unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
+ u64 v = *p & (~0ULL >> high_bit_offset);
+
+ if (!nr_key_bits) {
+ prt_str(out, "(empty)");
+ return;
+ }
+
+ while (1) {
+ unsigned next_key_bits = nr_key_bits;
+
+ if (nr_key_bits < 64) {
+ v >>= 64 - nr_key_bits;
+ next_key_bits = 0;
+ } else {
+ next_key_bits -= 64;
+ }
+
+ bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+
+ if (!next_key_bits)
+ break;
+
+ prt_char(out, ' ');
+
+ p = next_word(p);
+ v = *p;
+ word_bits = 64;
+ nr_key_bits = next_key_bits;
+ }
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
+{
+ struct bkey tmp;
+
+ BUG_ON(bkeyp_val_u64s(format, packed) !=
+ bkey_val_u64s(unpacked));
+
+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
+
+ tmp = __bch2_bkey_unpack_key(format, packed);
+
+ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
+ format->key_u64s,
+ format->bits_per_field[0],
+ format->bits_per_field[1],
+ format->bits_per_field[2],
+ format->bits_per_field[3],
+ format->bits_per_field[4]);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_to_text(&buf, unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_to_text(&buf, &tmp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) &tmp);
+ prt_newline(&buf);
+
+ panic("%s", buf.buf);
+ }
+}
+
+#else
+static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format) {}
+#endif
+
+struct pack_state {
+ const struct bkey_format *format;
+ unsigned bits; /* bits remaining in current word */
+ u64 w; /* current word */
+ u64 *p; /* pointer to next word */
+};
+
+__always_inline
+static struct pack_state pack_state_init(const struct bkey_format *format,
+ struct bkey_packed *k)
+{
+ u64 *p = high_word(format, k);
+
+ return (struct pack_state) {
+ .format = format,
+ .bits = 64 - high_bit_offset,
+ .w = 0,
+ .p = p,
+ };
+}
+
+__always_inline
+static void pack_state_finish(struct pack_state *state,
+ struct bkey_packed *k)
+{
+ EBUG_ON(state->p < k->_data);
+ EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
+
+ *state->p = state->w;
+}
+
+struct unpack_state {
+ const struct bkey_format *format;
+ unsigned bits; /* bits remaining in current word */
+ u64 w; /* current word */
+ const u64 *p; /* pointer to next word */
+};
+
+__always_inline
+static struct unpack_state unpack_state_init(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ const u64 *p = high_word(format, k);
+
+ return (struct unpack_state) {
+ .format = format,
+ .bits = 64 - high_bit_offset,
+ .w = *p << high_bit_offset,
+ .p = p,
+ };
+}
+
+__always_inline
+static u64 get_inc_field(struct unpack_state *state, unsigned field)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
+
+ if (bits >= state->bits) {
+ v = state->w >> (64 - bits);
+ bits -= state->bits;
+
+ state->p = next_word(state->p);
+ state->w = *state->p;
+ state->bits = 64;
+ }
+
+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+ v |= (state->w >> 1) >> (63 - bits);
+ state->w <<= bits;
+ state->bits -= bits;
+
+ return v + offset;
+}
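+
+/*
+ * Worked example of the field_offset handling above: with
+ * bits_per_field[BKEY_FIELD_INODE] == 20 and field_offset == 4096, packing
+ * inode 5000 stores 5000 - 4096 = 904 in 20 bits (see set_inc_field() below),
+ * and unpacking returns 904 + 4096 = 5000 here.
+ */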
+
+__always_inline
+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+
+ if (bits) {
+ if (bits > state->bits) {
+ bits -= state->bits;
+ /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
+ state->w |= (v >> 1) >> (bits - 1);
+
+ *state->p = state->w;
+ state->p = next_word(state->p);
+ state->w = 0;
+ state->bits = 64;
+ }
+
+ state->bits -= bits;
+ state->w |= v << state->bits;
+ }
+}
+
+__always_inline
+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
+
+ if (v < offset)
+ return false;
+
+ v -= offset;
+
+ if (fls64(v) > bits)
+ return false;
+
+ __set_inc_field(state, field, v);
+ return true;
+}
+
+/*
+ * Note: does NOT set out->format (we don't know what it should be here!)
+ *
+ * Also: doesn't work on extents - it doesn't preserve the invariant that
+ * if k is packed bkey_start_pos(k) will successfully pack
+ */
+static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
+ struct bkey_packed *out,
+ const struct bkey_format *in_f,
+ const struct bkey_packed *in)
+{
+ struct pack_state out_s = pack_state_init(out_f, out);
+ struct unpack_state in_s = unpack_state_init(in_f, in);
+ u64 *w = out->_data;
+ unsigned i;
+
+ *w = 0;
+
+ for (i = 0; i < BKEY_NR_FIELDS; i++)
+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
+ return false;
+
+ /* Can't happen because the val would be too big to unpack: */
+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
+ pack_state_finish(&out_s, out);
+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
+ out->needs_whiteout = in->needs_whiteout;
+ out->type = in->type;
+
+ return true;
+}
+
+bool bch2_bkey_transform(const struct bkey_format *out_f,
+ struct bkey_packed *out,
+ const struct bkey_format *in_f,
+ const struct bkey_packed *in)
+{
+ if (!bch2_bkey_transform_key(out_f, out, in_f, in))
+ return false;
+
+ memcpy_u64s((u64 *) out + out_f->key_u64s,
+ (u64 *) in + in_f->key_u64s,
+ (in->u64s - in_f->key_u64s));
+ return true;
+}
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct unpack_state state = unpack_state_init(format, in);
+ struct bkey out;
+
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
+
+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
+ out.format = KEY_FORMAT_CURRENT;
+ out.needs_whiteout = in->needs_whiteout;
+ out.type = in->type;
+ out.pad[0] = 0;
+
+#define x(id, field) out.field = get_inc_field(&state, id);
+ bkey_fields()
+#undef x
+
+ return out;
+}
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct unpack_state state = unpack_state_init(format, in);
+ struct bpos out;
+
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+
+ return out;
+}
+#endif
+
+/**
+ * bch2_bkey_pack_key -- pack just the key, not the value
+ * @out: packed result
+ * @in: key to pack
+ * @format: format of packed result
+ *
+ * Returns: true on success, false on failure
+ */
+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+ const struct bkey_format *format)
+{
+ struct pack_state state = pack_state_init(format, out);
+ u64 *w = out->_data;
+
+ EBUG_ON((void *) in == (void *) out);
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+ EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+ *w = 0;
+
+#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
+ bkey_fields()
+#undef x
+ pack_state_finish(&state, out);
+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->needs_whiteout = in->needs_whiteout;
+ out->type = in->type;
+
+ bch2_bkey_pack_verify(out, in, format);
+ return true;
+}
+
+/**
+ * bch2_bkey_unpack -- unpack the key and the value
+ * @b: btree node of @src key (for packed format)
+ * @dst: unpacked result
+ * @src: packed input
+ */
+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
+ const struct bkey_packed *src)
+{
+ __bkey_unpack_key(b, &dst->k, src);
+
+ memcpy_u64s(&dst->v,
+ bkeyp_val(&b->format, src),
+ bkeyp_val_u64s(&b->format, src));
+}
+
+/**
+ * bch2_bkey_pack -- pack the key and the value
+ * @dst: packed result
+ * @src: unpacked input
+ * @format: format of packed result
+ *
+ * Returns: true on success, false on failure
+ */
+bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
+ const struct bkey_format *format)
+{
+ struct bkey_packed tmp;
+
+ if (!bch2_bkey_pack_key(&tmp, &src->k, format))
+ return false;
+
+ memmove_u64s((u64 *) dst + format->key_u64s,
+ &src->v,
+ bkey_val_u64s(&src->k));
+ memcpy_u64s_small(dst, &tmp, format->key_u64s);
+
+ return true;
+}
+
+__always_inline
+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
+ bool ret = true;
+
+ EBUG_ON(v < offset);
+ v -= offset;
+
+ if (fls64(v) > bits) {
+ v = ~(~0ULL << bits);
+ ret = false;
+ }
+
+ __set_inc_field(state, field, v);
+ return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static bool bkey_packed_successor(struct bkey_packed *out,
+ const struct btree *b,
+ struct bkey_packed k)
+{
+ const struct bkey_format *f = &b->format;
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned first_bit, offset;
+ u64 *p;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ if (!nr_key_bits)
+ return false;
+
+ *out = k;
+
+ first_bit = high_bit_offset + nr_key_bits - 1;
+ p = nth_word(high_word(f, out), first_bit >> 6);
+ offset = 63 - (first_bit & 63);
+
+ while (nr_key_bits) {
+ unsigned bits = min(64 - offset, nr_key_bits);
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+ if ((*p & mask) != mask) {
+ *p += 1ULL << offset;
+ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
+ return true;
+ }
+
+ *p &= ~mask;
+ p = prev_word(p);
+ nr_key_bits -= bits;
+ offset = 0;
+ }
+
+ return false;
+}
+
+static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
+{
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 packed_max = f->bits_per_field[i]
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ : 0;
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (packed_max + field_offset < packed_max ||
+ packed_max + field_offset > unpacked_max)
+ return true;
+ }
+
+ return false;
+}
+#endif
+
+/*
+ * Returns a packed key that compares <= in
+ *
+ * This is used in bset_search_tree(), where we need a packed pos in order to be
+ * able to compare against the keys in the auxiliary search tree - and it's
+ * legal to use a packed pos that isn't equivalent to the original pos,
+ * _provided_ it compares <= to the original pos.
+ */
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
+ struct bpos in,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ struct pack_state state = pack_state_init(f, out);
+ u64 *w = out->_data;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bpos orig = in;
+#endif
+ bool exact = true;
+ unsigned i;
+
+	/*
+	 * bch2_bkey_pack_key() writes to all of f->key_u64s (minus the 3 byte
+	 * header), but this function only packs the pos fields - if the format
+	 * allocates bits to the size/version fields, those bits would be left
+	 * uninitialized, so zero the whole key first:
+	 */
+ for (i = 0; i < f->key_u64s; i++)
+ w[i] = 0;
+
+ if (unlikely(in.snapshot <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
+ if (!in.offset-- &&
+ !in.inode--)
+ return BKEY_PACK_POS_FAIL;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(in.offset <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
+ if (!in.inode--)
+ return BKEY_PACK_POS_FAIL;
+ in.offset = KEY_OFFSET_MAX;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(in.inode <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
+ return BKEY_PACK_POS_FAIL;
+
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
+ in.offset = KEY_OFFSET_MAX;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
+ exact = false;
+
+ pack_state_finish(&state, out);
+ out->u64s = f->key_u64s;
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->type = KEY_TYPE_deleted;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ if (exact) {
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+ } else {
+ struct bkey_packed successor;
+
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+ BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+ bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+ !bkey_format_has_too_big_fields(f));
+ }
+#endif
+
+ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
+}
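+
+/*
+ * Informal illustration of the lossy path above: if a field's value does not
+ * fit in the bits the format allocates to it, set_inc_field_lossy() clamps
+ * that field to its packed maximum and the remaining lower-priority fields
+ * are forced to their maxima before packing, so the result still compares
+ * <= the original position and BKEY_PACK_POS_SMALLER is returned.
+ */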
+
+void bch2_bkey_format_init(struct bkey_format_state *s)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
+ s->field_min[i] = U64_MAX;
+
+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
+ s->field_max[i] = 0;
+
+ /* Make sure we can store a size of 0: */
+ s->field_min[BKEY_FIELD_SIZE] = 0;
+}
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
+{
+ unsigned field = 0;
+
+ __bkey_format_add(s, field++, p.inode);
+ __bkey_format_add(s, field++, p.offset);
+ __bkey_format_add(s, field++, p.snapshot);
+}
+
+/*
+ * We don't want it to be possible for the packed format to represent fields
+ * bigger than a u64... that will cause confusion and issues (like with
+ * bkey_packed_successor())
+ */
+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
+ unsigned bits, u64 offset)
+{
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+
+ bits = min(bits, unpacked_bits);
+
+ offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
+
+ f->bits_per_field[i] = bits;
+ f->field_offset[i] = cpu_to_le64(offset);
+}
+
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
+{
+ unsigned i, bits = KEY_PACKED_BITS_START;
+ struct bkey_format ret = {
+ .nr_fields = BKEY_NR_FIELDS,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
+ s->field_min[i] = min(s->field_min[i], s->field_max[i]);
+
+ set_format_field(&ret, i,
+ fls64(s->field_max[i] - s->field_min[i]),
+ s->field_min[i]);
+
+ bits += ret.bits_per_field[i];
+ }
+
+ /* allow for extent merging: */
+ if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
+ unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
+
+ ret.bits_per_field[BKEY_FIELD_SIZE] += b;
+ bits += b;
+ }
+
+ ret.key_u64s = DIV_ROUND_UP(bits, 64);
+
+ /* if we have enough spare bits, round fields up to nearest byte */
+ bits = ret.key_u64s * 64 - bits;
+
+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
+ unsigned r = round_up(ret.bits_per_field[i], 8) -
+ ret.bits_per_field[i];
+
+ if (r <= bits) {
+ set_format_field(&ret, i,
+ ret.bits_per_field[i] + r,
+ le64_to_cpu(ret.field_offset[i]));
+ bits -= r;
+ }
+ }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ {
+ struct printbuf buf = PRINTBUF;
+
+ BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
+ printbuf_exit(&buf);
+ }
+#endif
+ return ret;
+}
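+
+/*
+ * Informal example (made-up numbers): if every key added to @s had an inode
+ * field in [4096, 4223], the resulting format gives the inode field a
+ * field_offset of 4096 and 7 bits_per_field (possibly rounded up to 8 by the
+ * byte-rounding pass above), so packed keys store inode - 4096 in a handful
+ * of bits instead of a full u64.
+ */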
+
+int bch2_bkey_format_invalid(struct bch_fs *c,
+ struct bkey_format *f,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ unsigned i, bits = KEY_PACKED_BITS_START;
+
+ if (f->nr_fields != BKEY_NR_FIELDS) {
+ prt_printf(err, "incorrect number of fields: got %u, should be %u",
+ f->nr_fields, BKEY_NR_FIELDS);
+ return -BCH_ERR_invalid;
+ }
+
+ /*
+ * Verify that the packed format can't represent fields larger than the
+ * unpacked format:
+ */
+ for (i = 0; i < f->nr_fields; i++) {
+ if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 packed_max = f->bits_per_field[i]
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ : 0;
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (packed_max + field_offset < packed_max ||
+ packed_max + field_offset > unpacked_max) {
+ prt_printf(err, "field %u too large: %llu + %llu > %llu",
+ i, packed_max, field_offset, unpacked_max);
+ return -BCH_ERR_invalid;
+ }
+ }
+
+ bits += f->bits_per_field[i];
+ }
+
+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
+ prt_printf(err, "incorrect key_u64s: got %u, should be %u",
+ f->key_u64s, DIV_ROUND_UP(bits, 64));
+ return -BCH_ERR_invalid;
+ }
+
+ return 0;
+}
+
+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
+{
+ prt_printf(out, "u64s %u fields ", f->key_u64s);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%u:%llu",
+ f->bits_per_field[i],
+ le64_to_cpu(f->field_offset[i]));
+ }
+}
+
+/*
+ * Most significant differing bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
+ const struct bkey_packed *l_k,
+ const struct bkey_packed *r_k)
+{
+ const u64 *l = high_word(&b->format, l_k);
+ const u64 *r = high_word(&b->format, r_k);
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned word_bits = 64 - high_bit_offset;
+ u64 l_v, r_v;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+ /* for big endian, skip past header */
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (nr_key_bits) {
+ if (nr_key_bits < word_bits) {
+ l_v >>= word_bits - nr_key_bits;
+ r_v >>= word_bits - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= word_bits;
+ }
+
+ if (l_v != r_v)
+ return fls64(l_v ^ r_v) - 1 + nr_key_bits;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ word_bits = 64;
+ }
+
+ return 0;
+}
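+
+/*
+ * For example: keys that differ only in their lowest key bit return 0, keys
+ * that differ in their highest key bit return nr_key_bits - 1, and identical
+ * keys also return 0.
+ */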
+
+/*
+ * First set bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
+{
+ const u64 *p = high_word(&b->format, k);
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned ret = 0, offset;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+ offset = nr_key_bits;
+ while (offset > 64) {
+ p = next_word(p);
+ offset -= 64;
+ }
+
+ offset = 64 - offset;
+
+ while (nr_key_bits) {
+ unsigned bits = nr_key_bits + offset < 64
+ ? nr_key_bits
+ : 64 - offset;
+
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+ if (*p & mask)
+ return ret + __ffs64(*p & mask) - offset;
+
+ p = prev_word(p);
+ nr_key_bits -= bits;
+ ret += bits;
+ offset = 0;
+ }
+
+ return 0;
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+#define I(_x) (*(out)++ = (_x))
+#define I1(i0) I(i0)
+#define I2(i0, i1) (I1(i0), I(i1))
+#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
+
+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
+ enum bch_bkey_fields field,
+ unsigned dst_offset, unsigned dst_size,
+ bool *eax_zeroed)
+{
+ unsigned bits = format->bits_per_field[field];
+ u64 offset = le64_to_cpu(format->field_offset[field]);
+ unsigned i, byte, bit_offset, align, shl, shr;
+
+ if (!bits && !offset) {
+ if (!*eax_zeroed) {
+ /* xor eax, eax */
+ I2(0x31, 0xc0);
+ }
+
+ *eax_zeroed = true;
+ goto set_field;
+ }
+
+ if (!bits) {
+ /* just return offset: */
+
+ switch (dst_size) {
+ case 8:
+ if (offset > S32_MAX) {
+ /* mov [rdi + dst_offset], offset */
+ I3(0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+
+ I3(0xc7, 0x47, dst_offset + 4);
+ memcpy(out, (void *) &offset + 4, 4);
+ out += 4;
+ } else {
+ /* mov [rdi + dst_offset], offset */
+ /* sign extended */
+ I4(0x48, 0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+ }
+ break;
+ case 4:
+ /* mov [rdi + dst_offset], offset */
+ I3(0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+ break;
+ default:
+ BUG();
+ }
+
+ return out;
+ }
+
+ bit_offset = format->key_u64s * 64;
+ for (i = 0; i <= field; i++)
+ bit_offset -= format->bits_per_field[i];
+
+ byte = bit_offset / 8;
+ bit_offset -= byte * 8;
+
+ *eax_zeroed = false;
+
+ if (bit_offset == 0 && bits == 8) {
+ /* movzx eax, BYTE PTR [rsi + imm8] */
+ I4(0x0f, 0xb6, 0x46, byte);
+ } else if (bit_offset == 0 && bits == 16) {
+ /* movzx eax, WORD PTR [rsi + imm8] */
+ I4(0x0f, 0xb7, 0x46, byte);
+ } else if (bit_offset + bits <= 32) {
+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 32);
+
+ /* mov eax, [rsi + imm8] */
+ I3(0x8b, 0x46, byte);
+
+ if (bit_offset) {
+ /* shr eax, imm8 */
+ I3(0xc1, 0xe8, bit_offset);
+ }
+
+ if (bit_offset + bits < 32) {
+ unsigned mask = ~0U >> (32 - bits);
+
+ /* and eax, imm32 */
+ I1(0x25);
+ memcpy(out, &mask, 4);
+ out += 4;
+ }
+ } else if (bit_offset + bits <= 64) {
+ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 64);
+
+ /* mov rax, [rsi + imm8] */
+ I4(0x48, 0x8b, 0x46, byte);
+
+ shl = 64 - bit_offset - bits;
+ shr = bit_offset + shl;
+
+ if (shl) {
+ /* shl rax, imm8 */
+ I4(0x48, 0xc1, 0xe0, shl);
+ }
+
+ if (shr) {
+ /* shr rax, imm8 */
+ I4(0x48, 0xc1, 0xe8, shr);
+ }
+ } else {
+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 96);
+
+ /* mov rax, [rsi + byte] */
+ I4(0x48, 0x8b, 0x46, byte);
+
+ /* mov edx, [rsi + byte + 8] */
+ I3(0x8b, 0x56, byte + 8);
+
+ /* bits from next word: */
+ shr = bit_offset + bits - 64;
+ BUG_ON(shr > bit_offset);
+
+ /* shr rax, bit_offset */
+ I4(0x48, 0xc1, 0xe8, shr);
+
+ /* shl rdx, imm8 */
+ I4(0x48, 0xc1, 0xe2, 64 - shr);
+
+ /* or rax, rdx */
+ I3(0x48, 0x09, 0xd0);
+
+ shr = bit_offset - shr;
+
+ if (shr) {
+ /* shr rax, imm8 */
+ I4(0x48, 0xc1, 0xe8, shr);
+ }
+ }
+
+ /* rax += offset: */
+ if (offset > S32_MAX) {
+ /* mov rdx, imm64 */
+ I2(0x48, 0xba);
+ memcpy(out, &offset, 8);
+ out += 8;
+ /* add %rdx, %rax */
+ I3(0x48, 0x01, 0xd0);
+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
+ /* add rax, imm32 */
+ I2(0x48, 0x05);
+ memcpy(out, &offset, 4);
+ out += 4;
+ } else if (offset) {
+ /* add eax, imm32 */
+ I1(0x05);
+ memcpy(out, &offset, 4);
+ out += 4;
+ }
+set_field:
+ switch (dst_size) {
+ case 8:
+ /* mov [rdi + dst_offset], rax */
+ I4(0x48, 0x89, 0x47, dst_offset);
+ break;
+ case 4:
+ /* mov [rdi + dst_offset], eax */
+ I3(0x89, 0x47, dst_offset);
+ break;
+ default:
+ BUG();
+ }
+
+ return out;
+}
+
+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
+{
+ bool eax_zeroed = false;
+ u8 *out = _out;
+
+ /*
+ * rdi: dst - unpacked key
+ * rsi: src - packed key
+ */
+
+ /* k->u64s, k->format, k->type */
+
+ /* mov eax, [rsi] */
+ I2(0x8b, 0x06);
+
+ /* add eax, BKEY_U64s - format->key_u64s */
+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
+
+ /* and eax, imm32: mask out k->pad: */
+ I5(0x25, 0xff, 0xff, 0xff, 0);
+
+ /* mov [rdi], eax */
+ I2(0x89, 0x07);
+
+#define x(id, field) \
+ out = compile_bkey_field(format, out, id, \
+ offsetof(struct bkey, field), \
+ sizeof(((struct bkey *) NULL)->field), \
+ &eax_zeroed);
+ bkey_fields()
+#undef x
+
+ /* retq */
+ I1(0xc3);
+
+ return (void *) out - _out;
+}
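+
+/*
+ * The buffer emitted above is a complete x86-64 function with the same
+ * signature as compiled_unpack_fn in bkey.h, i.e.
+ *
+ *	void unpack(struct bkey *dst, const struct bkey_packed *src);
+ *
+ * dst arrives in rdi and src in rsi per the SysV AMD64 calling convention,
+ * which is what the register comments above assume.
+ */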
+
+#else
+#endif
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
+{
+ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+}
+
+__pure __flatten
+int bch2_bkey_cmp_packed(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed_inlined(b, l, r);
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_left_packed(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
+{
+ const struct bkey *l_unpacked;
+
+ return unlikely(l_unpacked = packed_to_bkey_c(l))
+ ? bpos_cmp(l_unpacked->p, *r)
+ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+void bch2_bpos_swab(struct bpos *p)
+{
+ u8 *l = (u8 *) p;
+ u8 *h = ((u8 *) &p[1]) - 1;
+
+ while (l < h) {
+ swap(*l, *h);
+ l++;
+ --h;
+ }
+}
+
+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
+{
+ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
+ u8 *l = k->key_start;
+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+
+ while (l < h) {
+ swap(*l, *h);
+ l++;
+ --h;
+ }
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void)
+{
+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
+ struct bkey_packed p;
+
+ struct bkey_format test_format = {
+ .key_u64s = 3,
+ .nr_fields = BKEY_NR_FIELDS,
+ .bits_per_field = {
+ 13,
+ 64,
+ 32,
+ },
+ };
+
+ struct unpack_state in_s =
+ unpack_state_init(&bch2_bkey_format_current, (void *) &t);
+ struct pack_state out_s = pack_state_init(&test_format, &p);
+ unsigned i;
+
+ for (i = 0; i < out_s.format->nr_fields; i++) {
+ u64 a, v = get_inc_field(&in_s, i);
+
+ switch (i) {
+#define x(id, field) case id: a = t.field; break;
+ bkey_fields()
+#undef x
+ default:
+ BUG();
+ }
+
+ if (a != v)
+ panic("got %llu actual %llu i %u\n", v, a, i);
+
+ if (!set_inc_field(&out_s, i, v))
+ panic("failed at %u\n", i);
+ }
+
+ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
+}
+#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
new file mode 100644
index 000000000000..831be01809f2
--- /dev/null
+++ b/fs/bcachefs/bkey.h
@@ -0,0 +1,778 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_H
+#define _BCACHEFS_BKEY_H
+
+#include <linux/bug.h>
+#include "bcachefs_format.h"
+
+#include "btree_types.h"
+#include "util.h"
+#include "vstructs.h"
+
+enum bkey_invalid_flags {
+ BKEY_INVALID_WRITE = (1U << 0),
+ BKEY_INVALID_COMMIT = (1U << 1),
+ BKEY_INVALID_JOURNAL = (1U << 2),
+};
+
+#if 0
+
+/*
+ * compiled unpack functions are disabled, pending a new interface for
+ * dynamically allocating executable memory:
+ */
+
+#ifdef CONFIG_X86_64
+#define HAVE_BCACHEFS_COMPILED_UNPACK 1
+#endif
+#endif
+
+void bch2_bkey_packed_to_binary_text(struct printbuf *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_p_next(_k) vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+ return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+enum bkey_lr_packed {
+ BKEY_PACKED_BOTH,
+ BKEY_PACKED_RIGHT,
+ BKEY_PACKED_LEFT,
+ BKEY_PACKED_NONE,
+};
+
+#define bkey_lr_packed(_l, _r) \
+ ((_l)->format + ((_r)->format << 1))
+
+static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
+{
+ memcpy_u64s_small(dst, src, src->u64s);
+}
+
+static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
+{
+ memcpy_u64s_small(dst, src, src->k.u64s);
+}
+
+struct btree;
+
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+__pure
+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
+ const struct bkey_packed *,
+ const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
+ const struct bkey_packed *,
+ const struct bpos *);
+
+__pure
+int bch2_bkey_cmp_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_left_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bpos *);
+
+static inline __pure
+int bkey_cmp_left_packed(const struct btree *b,
+ const struct bkey_packed *l, const struct bpos *r)
+{
+ return __bch2_bkey_cmp_left_packed(b, l, r);
+}
+
+/*
+ * The compiler generates better code when we pass bpos by ref, but it's often
+ * much more convenient to pass it by val... as much as I hate C++, const ref
+ * would be nice here:
+ */
+__pure __flatten
+static inline int bkey_cmp_left_packed_byval(const struct btree *b,
+ const struct bkey_packed *l,
+ struct bpos r)
+{
+ return bkey_cmp_left_packed(b, l, &r);
+}
+
+static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
+{
+ return !((l.inode ^ r.inode) |
+ (l.offset ^ r.offset) |
+ (l.snapshot ^ r.snapshot));
+}
+
+static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode ? l.inode < r.inode :
+ l.offset != r.offset ? l.offset < r.offset :
+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
+}
+
+static __always_inline bool bpos_le(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode ? l.inode < r.inode :
+ l.offset != r.offset ? l.offset < r.offset :
+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
+}
+
+static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
+{
+ return bpos_lt(r, l);
+}
+
+static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
+{
+ return bpos_le(r, l);
+}
+
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset) ?:
+ cmp_int(l.snapshot, r.snapshot);
+}
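+
+/*
+ * Positions order lexicographically by (inode, offset, snapshot): e.g. a
+ * position with inode 1, offset 100, snapshot 7 sorts before one with inode 1,
+ * offset 101, snapshot 0, because the offsets differ before the snapshot field
+ * is consulted.
+ */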
+
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+{
+ return bpos_lt(l, r) ? l : r;
+}
+
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{
+ return bpos_gt(l, r) ? l : r;
+}
+
+static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
+{
+ return !((l.inode ^ r.inode) |
+ (l.offset ^ r.offset));
+}
+
+static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode
+ ? l.inode < r.inode
+ : l.offset < r.offset;
+}
+
+static __always_inline bool bkey_le(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode
+ ? l.inode < r.inode
+ : l.offset <= r.offset;
+}
+
+static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
+{
+ return bkey_lt(r, l);
+}
+
+static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
+{
+ return bkey_le(r, l);
+}
+
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset);
+}
+
+static inline struct bpos bkey_min(struct bpos l, struct bpos r)
+{
+ return bkey_lt(l, r) ? l : r;
+}
+
+static inline struct bpos bkey_max(struct bpos l, struct bpos r)
+{
+ return bkey_gt(l, r) ? l : r;
+}
+
+void bch2_bpos_swab(struct bpos *);
+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
+
+static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
+{
+ return cmp_int(l.hi, r.hi) ?:
+ cmp_int(l.lo, r.lo);
+}
+
+#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
+#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
+
+static __always_inline int bversion_zero(struct bversion v)
+{
+ return !bversion_cmp(v, ZERO_VERSION);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+/* statement expressions confusing unlikely()? */
+#define bkey_packed(_k) \
+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
+ (_k)->format != KEY_FORMAT_CURRENT; })
+#else
+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
+#endif
+
+/*
+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse
+ */
+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
+{
+ return (struct bkey_packed *) k;
+}
+
+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
+{
+ return (const struct bkey_packed *) k;
+}
+
+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
+{
+ return bkey_packed(k) ? NULL : (struct bkey_i *) k;
+}
+
+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
+{
+ return bkey_packed(k) ? NULL : (const struct bkey *) k;
+}
+
+static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
+{
+ return format->bits_per_field[BKEY_FIELD_INODE] +
+ format->bits_per_field[BKEY_FIELD_OFFSET] +
+ format->bits_per_field[BKEY_FIELD_SNAPSHOT];
+}
+
+static inline struct bpos bpos_successor(struct bpos p)
+{
+ if (!++p.snapshot &&
+ !++p.offset &&
+ !++p.inode)
+ BUG();
+
+ return p;
+}
+
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+ if (!p.snapshot-- &&
+ !p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
+}
+
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
+{
+ p.snapshot = 0;
+
+ if (!++p.offset &&
+ !++p.inode)
+ BUG();
+
+ return p;
+}
+
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+ p.snapshot = 0;
+
+ if (!p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
+}
+
+static inline u64 bkey_start_offset(const struct bkey *k)
+{
+ return k->p.offset - k->size;
+}
+
+static inline struct bpos bkey_start_pos(const struct bkey *k)
+{
+ return (struct bpos) {
+ .inode = k->p.inode,
+ .offset = bkey_start_offset(k),
+ .snapshot = k->p.snapshot,
+ };
+}
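+
+/*
+ * For extents, k->p is the *end* position and k->size the length: e.g. an
+ * extent with p.offset == 100 and size == 20 covers [80, 100), and
+ * bkey_start_offset() returns 80.
+ */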
+
+/* Packed helpers */
+
+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+
+ EBUG_ON(k->u64s < ret);
+ return ret;
+}
+
+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return bkeyp_key_u64s(format, k) * sizeof(u64);
+}
+
+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return k->u64s - bkeyp_key_u64s(format, k);
+}
+
+static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return bkeyp_val_u64s(format, k) * sizeof(u64);
+}
+
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
+ struct bkey_packed *k, unsigned val_u64s)
+{
+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
+}
+
+#define bkeyp_val(_format, _k) \
+ ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
+
+extern const struct bkey_format bch2_bkey_format_current;
+
+bool bch2_bkey_transform(const struct bkey_format *,
+ struct bkey_packed *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
+ const struct bkey_packed *);
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *,
+ const struct bkey_packed *);
+#endif
+
+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
+ const struct bkey_format *);
+
+enum bkey_pack_pos_ret {
+ BKEY_PACK_POS_EXACT,
+ BKEY_PACK_POS_SMALLER,
+ BKEY_PACK_POS_FAIL,
+};
+
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
+ const struct btree *);
+
+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
+ const struct btree *b)
+{
+ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
+}
+
+void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
+ const struct bkey_packed *);
+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
+ const struct bkey_format *);
+
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
+ compiled_unpack_fn unpack_fn = b->aux_data;
+ unpack_fn(dst, src);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ bch2_expensive_debug_checks) {
+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+ }
+ } else {
+ *dst = __bch2_bkey_unpack_key(&b->format, src);
+ }
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ struct bkey dst;
+
+ __bkey_unpack_key_format_checked(b, &dst, src);
+ return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (likely(bkey_packed(src)))
+ __bkey_unpack_key_format_checked(b, dst, src);
+ else
+ *dst = *packed_to_bkey_c(src);
+}
+
+/**
+ * bkey_unpack_key -- unpack just the key, not the value
+ * @b:		btree node @src comes from (for its packed format)
+ * @src:	packed key to unpack
+ *
+ * Returns: the unpacked key
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_key_format_checked(b, src)
+ : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+ return bkey_unpack_key_format_checked(b, src).p;
+#else
+ return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_pos_format_checked(b, src)
+ : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
+ const struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(const struct btree *b,
+ struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
+static inline u64 bkey_field_max(const struct bkey_format *f,
+ enum bch_bkey_fields nr)
+{
+ return f->bits_per_field[nr] < 64
+ ? (le64_to_cpu(f->field_offset[nr]) +
+ ~(~0ULL << f->bits_per_field[nr]))
+ : U64_MAX;
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+int bch2_compile_bkey_format(const struct bkey_format *, void *);
+
+#else
+
+static inline int bch2_compile_bkey_format(const struct bkey_format *format,
+ void *out) { return 0; }
+
+#endif
+
+static inline void bkey_reassemble(struct bkey_i *dst,
+ struct bkey_s_c src)
+{
+ dst->k = *src.k;
+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
+}
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...) \
+struct bkey_i_##name { \
+ union { \
+ struct bkey k; \
+ struct bkey_i k_i; \
+ }; \
+ struct bch_##name v; \
+}; \
+ \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = KEY_TYPE_##name; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+BCH_BKEY_TYPES();
+#undef x
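+
+/*
+ * Informal sketch of the expansion above for a hypothetical value type "foo"
+ * (i.e. KEY_TYPE_foo with value struct bch_foo): the macro generates
+ *
+ *	struct bkey_i_foo, struct bkey_s_foo, struct bkey_s_c_foo
+ *
+ * plus checked conversions such as
+ *
+ *	struct bkey_i_foo *f	= bkey_i_to_foo(k);
+ *	struct bkey_s_c_foo f_c	= bkey_s_c_to_foo(k_s_c);
+ *
+ * and bkey_foo_init() for initializing a new key of that type.
+ */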
+
+/* byte order helpers */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+ return f->key_u64s - 1;
+}
+
+#define high_bit_offset 0
+#define nth_word(p, n) ((p) - (n))
+
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+ return 0;
+}
+
+#define high_bit_offset KEY_PACKED_BITS_START
+#define nth_word(p, n) ((p) + (n))
+
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f))
+#define next_word(p) nth_word(p, 1)
+#define prev_word(p) nth_word(p, -1)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void);
+#else
+static inline void bch2_bkey_pack_test(void) {}
+#endif
+
+#define bkey_fields() \
+ x(BKEY_FIELD_INODE, p.inode) \
+ x(BKEY_FIELD_OFFSET, p.offset) \
+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
+ x(BKEY_FIELD_SIZE, size) \
+ x(BKEY_FIELD_VERSION_HI, version.hi) \
+ x(BKEY_FIELD_VERSION_LO, version.lo)
+
+struct bkey_format_state {
+ u64 field_min[BKEY_NR_FIELDS];
+ u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch2_bkey_format_init(struct bkey_format_state *);
+
+static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
+{
+ s->field_min[field] = min(s->field_min[field], v);
+ s->field_max[field] = max(s->field_max[field], v);
+}
+
+/*
+ * Updates the format state @s so that @k can be successfully packed by the
+ * format that bch2_bkey_format_done() will eventually produce
+ */
+static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+#define x(id, field) __bkey_format_add(s, id, k->field);
+ bkey_fields()
+#undef x
+}
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
+
+#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
new file mode 100644
index 000000000000..a30c4ae8eb36
--- /dev/null
+++ b/fs/bcachefs/bkey_buf.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_BUF_H
+#define _BCACHEFS_BKEY_BUF_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+
+struct bkey_buf {
+ struct bkey_i *k;
+ u64 onstack[12];
+};
+
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
+ struct bch_fs *c, unsigned u64s)
+{
+ if (s->k == (void *) s->onstack &&
+ u64s > ARRAY_SIZE(s->onstack)) {
+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+ memcpy(s->k, s->onstack, sizeof(s->onstack));
+ }
+}
+
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_buf_realloc(s, c, k.k->u64s);
+ bkey_reassemble(s->k, k);
+}
+
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_i *src)
+{
+ bch2_bkey_buf_realloc(s, c, src->k.u64s);
+ bkey_copy(s->k, src);
+}
+
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct btree *b,
+ struct bkey_packed *src)
+{
+ bch2_bkey_buf_realloc(s, c, BKEY_U64s +
+ bkeyp_val_u64s(&b->format, src));
+ bch2_bkey_unpack(b, s->k, src);
+}
+
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
+{
+ s->k = (void *) s->onstack;
+}
+
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
+{
+ if (s->k != (void *) s->onstack)
+ mempool_free(s->k, &c->large_bkey_pool);
+ s->k = NULL;
+}
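+
+/*
+ * Informal usage sketch (callers are not part of this file): a bkey_buf is
+ * typically used as
+ *
+ *	struct bkey_buf tmp;
+ *
+ *	bch2_bkey_buf_init(&tmp);
+ *	bch2_bkey_buf_reassemble(&tmp, c, k);
+ *	... use tmp.k ...
+ *	bch2_bkey_buf_exit(&tmp, c);
+ *
+ * so keys of up to 12 u64s stay on the stack and larger ones fall back to the
+ * filesystem's large_bkey_pool mempool.
+ */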
+
+#endif /* _BCACHEFS_BKEY_BUF_H */
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
new file mode 100644
index 000000000000..5f42a6e69360
--- /dev/null
+++ b/fs/bcachefs/bkey_cmp.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_CMP_H
+#define _BCACHEFS_BKEY_CMP_H
+
+#include "bkey.h"
+
+#ifdef CONFIG_X86_64
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ long d0, d1, d2, d3;
+ int cmp;
+
+	/* we shouldn't need asm for this, but gcc generates poor code for it: */
+
+ asm(".intel_syntax noprefix;"
+ "xor eax, eax;"
+ "xor edx, edx;"
+ "1:;"
+ "mov r8, [rdi];"
+ "mov r9, [rsi];"
+ "sub ecx, 64;"
+ "jl 2f;"
+
+ "cmp r8, r9;"
+ "jnz 3f;"
+
+ "lea rdi, [rdi - 8];"
+ "lea rsi, [rsi - 8];"
+ "jmp 1b;"
+
+ "2:;"
+ "not ecx;"
+ "shr r8, 1;"
+ "shr r9, 1;"
+ "shr r8, cl;"
+ "shr r9, cl;"
+ "cmp r8, r9;"
+
+ "3:\n"
+ "seta al;"
+ "setb dl;"
+ "sub eax, edx;"
+ ".att_syntax prefix;"
+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+ : "0" (l), "1" (r), "3" (nr_key_bits)
+ : "r8", "r9", "cc", "memory");
+
+ return cmp;
+}
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ u64 l_v, r_v;
+
+ if (!nr_key_bits)
+ return 0;
+
+ /* for big endian, skip past header */
+ nr_key_bits += high_bit_offset;
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (1) {
+ if (nr_key_bits < 64) {
+ l_v >>= 64 - nr_key_bits;
+ r_v >>= 64 - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= 64;
+ }
+
+ if (!nr_key_bits || l_v != r_v)
+ break;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ }
+
+ return cmp_int(l_v, r_v);
+}
+#endif
+
+static inline __pure __flatten
+int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ int ret;
+
+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ ret = __bkey_cmp_bits(high_word(f, l),
+ high_word(f, r),
+ b->nr_key_bits);
+
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
+ bkey_unpack_pos(b, r)));
+ return ret;
+}
+
+static inline __pure __flatten
+int bch2_bkey_cmp_packed_inlined(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ struct bkey unpacked;
+
+ if (likely(bkey_packed(l) && bkey_packed(r)))
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+
+ if (bkey_packed(l)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, l);
+ l = (void *) &unpacked;
+ } else if (bkey_packed(r)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, r);
+ r = (void *) &unpacked;
+ }
+
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+}
+
+#endif /* _BCACHEFS_BKEY_CMP_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
new file mode 100644
index 000000000000..761f5e33b1e6
--- /dev/null
+++ b/fs/bcachefs/bkey_methods.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "backpointers.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_types.h"
+#include "alloc_background.h"
+#include "dirent.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "lru.h"
+#include "quota.h"
+#include "reflink.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "xattr.h"
+
+const char * const bch2_bkey_types[] = {
+#define x(name, nr) #name,
+ BCH_BKEY_TYPES()
+#undef x
+ NULL
+};
+
+static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ return 0;
+}
+
+#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
+ .key_invalid = deleted_key_invalid, \
+})
+
+#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
+ .key_invalid = deleted_key_invalid, \
+})
+
+static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
+ bkey_val_size_nonzero,
+ "incorrect value size (%zu != 0)",
+ bkey_val_bytes(k.k));
+fsck_err:
+ return ret;
+}
+
+#define bch2_bkey_ops_error ((struct bkey_ops) { \
+ .key_invalid = empty_val_key_invalid, \
+})
+
+static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ return 0;
+}
+
+#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
+ .key_invalid = key_type_cookie_invalid, \
+ .min_val_size = 8, \
+})
+
+#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
+ .key_invalid = empty_val_key_invalid, \
+})
+
+static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ return 0;
+}
+
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ prt_printf(out, "datalen %u: %*phN",
+ datalen, min(datalen, 32U), d.v->data);
+}
+
+#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
+ .key_invalid = key_type_inline_data_invalid, \
+ .val_to_text = key_type_inline_data_to_text, \
+})
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+#define bch2_bkey_ops_set ((struct bkey_ops) { \
+ .key_invalid = empty_val_key_invalid, \
+ .key_merge = key_type_set_merge, \
+})
+
+const struct bkey_ops bch2_bkey_ops[] = {
+#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
+ BCH_BKEY_TYPES()
+#undef x
+};
+
+const struct bkey_ops bch2_bkey_null_ops = {
+};
+
+int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
+ bkey_val_size_too_small,
+ "bad val size (%zu < %u)",
+ bkey_val_bytes(k.k), ops->min_val_size);
+
+ if (!ops->key_invalid)
+ return 0;
+
+ ret = ops->key_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
+}
+
+static u64 bch2_key_types_allowed[] = {
+ [BKEY_TYPE_btree] =
+ BIT_ULL(KEY_TYPE_deleted)|
+ BIT_ULL(KEY_TYPE_btree_ptr)|
+ BIT_ULL(KEY_TYPE_btree_ptr_v2),
+#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
+ BCH_BTREE_IDS()
+#undef x
+};
+
+const char *bch2_btree_node_type_str(enum btree_node_type type)
+{
+ return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
+}
+
+int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
+ bkey_u64s_too_small,
+ "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
+
+ if (type >= BKEY_TYPE_NR)
+ return 0;
+
+ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
+ bkey_invalid_type_for_btree,
+ "invalid key type for btree %s (%s)",
+ bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+
+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ bkey_fsck_err_on(k.k->size == 0, c, err,
+ bkey_extent_size_zero,
+ "size == 0");
+
+ bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
+ bkey_extent_size_greater_than_offset,
+ "size greater than offset (%u > %llu)",
+ k.k->size, k.k->p.offset);
+ } else {
+ bkey_fsck_err_on(k.k->size, c, err,
+ bkey_size_nonzero,
+ "size != 0");
+ }
+
+ if (type != BKEY_TYPE_btree) {
+ enum btree_id btree = type - 1;
+
+ if (btree_type_has_snapshots(btree)) {
+ bkey_fsck_err_on(!k.k->p.snapshot, c, err,
+ bkey_snapshot_zero,
+ "snapshot == 0");
+ } else if (!btree_type_has_snapshot_field(btree)) {
+ bkey_fsck_err_on(k.k->p.snapshot, c, err,
+ bkey_snapshot_nonzero,
+ "nonzero snapshot");
+ } else {
+ /*
+ * btree uses snapshot field but it's not required to be
+ * nonzero
+ */
+ }
+
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
+ bkey_at_pos_max,
+ "key at POS_MAX");
+ }
+fsck_err:
+ return ret;
+}
+
+int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ return __bch2_bkey_invalid(c, k, type, flags, err) ?:
+ bch2_bkey_val_invalid(c, k, flags, err);
+}
+
+int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k, struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
+ bkey_before_start_of_btree_node,
+ "key before start of btree node");
+
+ bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
+ bkey_after_end_of_btree_node,
+ "key past end of btree node");
+fsck_err:
+ return ret;
+}
+
+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
+{
+ if (bpos_eq(pos, POS_MIN))
+ prt_printf(out, "POS_MIN");
+ else if (bpos_eq(pos, POS_MAX))
+ prt_printf(out, "POS_MAX");
+ else if (bpos_eq(pos, SPOS_MAX))
+ prt_printf(out, "SPOS_MAX");
+ else {
+ if (pos.inode == U64_MAX)
+ prt_printf(out, "U64_MAX");
+ else
+ prt_printf(out, "%llu", pos.inode);
+ prt_printf(out, ":");
+ if (pos.offset == U64_MAX)
+ prt_printf(out, "U64_MAX");
+ else
+ prt_printf(out, "%llu", pos.offset);
+ prt_printf(out, ":");
+ if (pos.snapshot == U32_MAX)
+ prt_printf(out, "U32_MAX");
+ else
+ prt_printf(out, "%u", pos.snapshot);
+ }
+}
+
+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
+{
+ if (k) {
+ prt_printf(out, "u64s %u type ", k->u64s);
+
+ if (k->type < KEY_TYPE_MAX)
+ prt_printf(out, "%s ", bch2_bkey_types[k->type]);
+ else
+ prt_printf(out, "%u ", k->type);
+
+ bch2_bpos_to_text(out, k->p);
+
+ prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
+ } else {
+ prt_printf(out, "(null)");
+ }
+}
+
+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+ if (likely(ops->val_to_text))
+ ops->val_to_text(out, c, k);
+}
+
+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_to_text(out, k.k);
+
+ if (bkey_val_bytes(k.k)) {
+ prt_printf(out, ": ");
+ bch2_val_to_text(out, c, k);
+ }
+}
+
+void bch2_bkey_swab_val(struct bkey_s k)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+ if (ops->swab)
+ ops->swab(k);
+}
+
+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+ return ops->key_normalize
+ ? ops->key_normalize(c, k)
+ : false;
+}
+
+bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
+
+ return ops->key_merge &&
+ bch2_bkey_maybe_mergable(l.k, r.k) &&
+ (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
+ !bch2_key_merging_disabled &&
+ ops->key_merge(c, l, r);
+}
+
+static const struct old_bkey_type {
+ u8 btree_node_type;
+ u8 old;
+ u8 new;
+} bkey_renumber_table[] = {
+ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr },
+ {BKEY_TYPE_extents, 128, KEY_TYPE_extent },
+ {BKEY_TYPE_extents, 129, KEY_TYPE_extent },
+ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation },
+ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode },
+ {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation },
+ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent },
+ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout },
+ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr },
+ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout },
+ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc },
+ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota },
+};
+
+void bch2_bkey_renumber(enum btree_node_type btree_node_type,
+ struct bkey_packed *k,
+ int write)
+{
+ const struct old_bkey_type *i;
+
+ for (i = bkey_renumber_table;
+ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
+ i++)
+ if (btree_node_type == i->btree_node_type &&
+ k->type == (write ? i->new : i->old)) {
+ k->type = write ? i->old : i->new;
+ break;
+ }
+}
+
+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ const struct bkey_ops *ops;
+ struct bkey uk;
+ unsigned nr_compat = 5;
+ int i;
+
+ /*
+ * Do these operations in reverse order in the write path:
+ */
+
+ for (i = 0; i < nr_compat; i++)
+ switch (!write ? i : nr_compat - 1 - i) {
+ case 0:
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_key(f, k);
+ break;
+ case 1:
+ if (version < bcachefs_metadata_version_bkey_renumber)
+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
+ break;
+ case 2:
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes) {
+ if (!bkey_packed(k)) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ swap(u->k.p.inode, u->k.p.offset);
+ } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
+ f->bits_per_field[BKEY_FIELD_OFFSET]) {
+ struct bkey_format tmp = *f, *in = f, *out = &tmp;
+
+ swap(tmp.bits_per_field[BKEY_FIELD_INODE],
+ tmp.bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(tmp.field_offset[BKEY_FIELD_INODE],
+ tmp.field_offset[BKEY_FIELD_OFFSET]);
+
+ if (!write)
+ swap(in, out);
+
+ uk = __bch2_bkey_unpack_key(in, k);
+ swap(uk.p.inode, uk.p.offset);
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
+ }
+ }
+ break;
+ case 3:
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ if (u) {
+ u->k.p.snapshot = write
+ ? 0 : U32_MAX;
+ } else {
+ u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
+ u64 max_packed = min_packed +
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ uk = __bch2_bkey_unpack_key(f, k);
+ uk.p.snapshot = write
+ ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+ }
+ }
+
+ break;
+ case 4: {
+ struct bkey_s u;
+
+ if (!bkey_packed(k)) {
+ u = bkey_i_to_s(packed_to_bkey(k));
+ } else {
+ uk = __bch2_bkey_unpack_key(f, k);
+ u.k = &uk;
+ u.v = bkeyp_val(f, k);
+ }
+
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_val(u);
+
+ ops = bch2_bkey_type_ops(k->type);
+
+ if (ops->compat)
+ ops->compat(btree_id, version, big_endian, write, u);
+ break;
+ }
+ default:
+ BUG();
+ }
+}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
new file mode 100644
index 000000000000..3a370b7087ac
--- /dev/null
+++ b/fs/bcachefs/bkey_methods.h
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_METHODS_H
+#define _BCACHEFS_BKEY_METHODS_H
+
+#include "bkey.h"
+
+struct bch_fs;
+struct btree;
+struct btree_trans;
+struct bkey;
+enum btree_node_type;
+
+extern const char * const bch2_bkey_types[];
+extern const struct bkey_ops bch2_bkey_null_ops;
+
+/*
+ * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
+ * invalid, entire key will be deleted.
+ *
+ * When invalid, error string is returned via @err. @flags indicates whether the
+ * key is being read, committed or journalled; some checks are only performed
+ * when BKEY_INVALID_COMMIT is set.
+ */
+struct bkey_ops {
+ int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err);
+ void (*val_to_text)(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+ void (*swab)(struct bkey_s);
+ bool (*key_normalize)(struct bch_fs *, struct bkey_s);
+ bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+ int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+ int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+ void (*compat)(enum btree_id id, unsigned version,
+ unsigned big_endian, int write,
+ struct bkey_s);
+
+ /* Size of value type when first created: */
+ unsigned min_val_size;
+};
+
+extern const struct bkey_ops bch2_bkey_ops[];
+
+static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
+{
+ return likely(type < KEY_TYPE_MAX)
+ ? &bch2_bkey_ops[type]
+ : &bch2_bkey_null_ops;
+}
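The bkey_ops vtable above is selected purely by key type, with bch2_bkey_null_ops as a fallback for types this kernel doesn't know about, so callers can dispatch without checking the type first. A rough sketch of that dispatch pattern in isolation - the key types and handler below are hypothetical, not actual bcachefs types:

/* Illustration only: hypothetical key types and handlers. */
#include <stdio.h>

enum example_key_type { KEY_A, KEY_B, KEY_NR };

struct example_ops {
	void (*to_text)(unsigned type);
};

static void a_to_text(unsigned type)
{
	printf("key type %u: A\n", type);
}

static const struct example_ops example_ops[KEY_NR] = {
	[KEY_A] = { .to_text = a_to_text },
	/* KEY_B left zeroed: no handlers */
};

static const struct example_ops example_null_ops;	/* all callbacks NULL */

static const struct example_ops *type_ops(unsigned type)
{
	return type < KEY_NR ? &example_ops[type] : &example_null_ops;
}

int main(void)
{
	for (unsigned t = 0; t <= KEY_NR; t++) {
		const struct example_ops *ops = type_ops(t);

		if (ops->to_text)
			ops->to_text(t);
		else
			printf("key type %u: no handler\n", t);
	}
	return 0;
}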
+
+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
+ struct bkey_s_c, struct printbuf *);
+
+void bch2_bpos_to_text(struct printbuf *, struct bpos);
+void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
+void bch2_val_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+void bch2_bkey_swab_val(struct bkey_s);
+
+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
+
+static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
+{
+ return l->type == r->type &&
+ !bversion_cmp(l->version, r->version) &&
+ bpos_eq(l->p, bkey_start_pos(r));
+}
+
+bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+static inline int bch2_mark_key(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
+
+ return ops->atomic_trigger
+ ? ops->atomic_trigger(trans, btree, level, old, new, flags)
+ : 0;
+}
+
+enum btree_update_flags {
+ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
+ __BTREE_UPDATE_NOJOURNAL,
+ __BTREE_UPDATE_PREJOURNAL,
+ __BTREE_UPDATE_KEY_CACHE_RECLAIM,
+
+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
+
+ __BTREE_TRIGGER_INSERT,
+ __BTREE_TRIGGER_OVERWRITE,
+
+ __BTREE_TRIGGER_GC,
+ __BTREE_TRIGGER_BUCKET_INVALIDATE,
+ __BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
+#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+
+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+
+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
+
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
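These definitions follow the common kernel idiom of an enum of bit positions paired with shifted mask macros; because the enum continues from __BTREE_ITER_FLAGS_END, iterator, update and trigger flags can all share a single flags word. A small generic sketch of the idiom, with illustrative names rather than the bcachefs flags:

#include <stdio.h>

enum { __FLAG_INSERT, __FLAG_OVERWRITE, __FLAG_GC };

#define FLAG_INSERT	(1U << __FLAG_INSERT)
#define FLAG_OVERWRITE	(1U << __FLAG_OVERWRITE)
#define FLAG_GC		(1U << __FLAG_GC)

static void run(unsigned flags)
{
	if (flags & FLAG_INSERT)
		printf("insert\n");
	if (flags & FLAG_OVERWRITE)
		printf("overwrite\n");
	if (flags & FLAG_GC)
		printf("gc\n");
}

int main(void)
{
	run(FLAG_INSERT | FLAG_GC);	/* prints "insert" then "gc" */
	return 0;
}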
+
+static inline int bch2_trans_mark_key(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type);
+
+ return ops->trans_trigger
+ ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+ : 0;
+}
+
+static inline int bch2_trans_mark_old(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = old.k->p;
+
+ return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+}
+
+static inline int bch2_trans_mark_new(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_i *new, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = new->k.p;
+
+ return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_INSERT|flags);
+}
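bch2_trans_mark_old() and bch2_trans_mark_new() reuse the two-key (old, new) trigger interface for single-key events by pairing the real key with a synthesized deleted key at the same position, so one trigger implementation sees inserts and overwrites uniformly. A standalone sketch of that idea with simplified stand-in types (not the real trigger machinery):

#include <stdio.h>
#include <stdbool.h>

struct key { unsigned pos; bool deleted; };

/* one callback sees every transition as old -> new */
static void trigger(struct key old, struct key new)
{
	if (old.deleted && !new.deleted)
		printf("insert at %u\n", new.pos);
	else if (!old.deleted && new.deleted)
		printf("overwrite at %u\n", old.pos);
}

static void mark_new(struct key new)
{
	struct key deleted = { .pos = new.pos, .deleted = true };

	trigger(deleted, new);
}

static void mark_old(struct key old)
{
	struct key deleted = { .pos = old.pos, .deleted = true };

	trigger(old, deleted);
}

int main(void)
{
	mark_new((struct key) { .pos = 7 });	/* "insert at 7" */
	mark_old((struct key) { .pos = 7 });	/* "overwrite at 7" */
	return 0;
}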
+
+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
+
+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
+ int, struct bkey_format *, struct bkey_packed *);
+
+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ if (version < bcachefs_metadata_version_current ||
+ big_endian != CPU_BIG_ENDIAN)
+ __bch2_bkey_compat(level, btree_id, version,
+ big_endian, write, f, k);
+}
+
+#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
new file mode 100644
index 000000000000..bcca9e76a0b4
--- /dev/null
+++ b/fs/bcachefs/bkey_sort.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "bkey_cmp.h"
+#include "bkey_sort.h"
+#include "bset.h"
+#include "extents.h"
+
+typedef int (*sort_cmp_fn)(struct btree *,
+ struct bkey_packed *,
+ struct bkey_packed *);
+
+static inline bool sort_iter_end(struct sort_iter *iter)
+{
+ return !iter->used;
+}
+
+static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
+ sort_cmp_fn cmp)
+{
+ unsigned i;
+
+ for (i = from;
+ i + 1 < iter->used &&
+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
+ i++)
+ swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ unsigned i = iter->used;
+
+ while (i--)
+ sort_iter_sift(iter, i, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+ return !sort_iter_end(iter) ? iter->data->k : NULL;
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ struct sort_iter_set *i = iter->data;
+
+ BUG_ON(!iter->used);
+
+ i->k = bkey_p_next(i->k);
+
+ BUG_ON(i->k > i->end);
+
+ if (i->k == i->end)
+ array_remove_item(iter->data, iter->used, 0);
+ else
+ sort_iter_sift(iter, 0, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+ sort_cmp_fn cmp)
+{
+ struct bkey_packed *ret = sort_iter_peek(iter);
+
+ if (ret)
+ sort_iter_advance(iter, cmp);
+
+ return ret;
+}
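sort_iter merges several already-sorted key ranges by keeping its array of source positions ordered by current key and re-sifting the front entry after each advance, instead of maintaining a heap - cheap for the handful of bsets a btree node can have. A self-contained sketch of the same sift-based k-way merge over plain ints (illustrative only):

#include <stdio.h>

struct src { const int *k, *end; };

static void sift(struct src *s, unsigned used, unsigned from)
{
	for (unsigned i = from; i + 1 < used && *s[i].k > *s[i + 1].k; i++) {
		struct src tmp = s[i];

		s[i] = s[i + 1];
		s[i + 1] = tmp;
	}
}

int main(void)
{
	int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5 };
	struct src s[] = { { a, a + 3 }, { b, b + 3 }, { c, c + 1 } };
	unsigned used = 3;

	/* initial sort: sift each source into place, back to front */
	for (unsigned i = used; i--; )
		sift(s, used, i);

	while (used) {
		printf("%d ", *s[0].k++);

		if (s[0].k == s[0].end) {
			/* drop the exhausted source, preserving order */
			for (unsigned i = 1; i < used; i++)
				s[i - 1] = s[i];
			used--;
		} else {
			sift(s, used, 0);
		}
	}
	printf("\n");	/* 1 2 3 4 5 9 10 */
	return 0;
}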
+
+/*
+ * If keys compare equal, compare by pointer order:
+ */
+static inline int key_sort_fix_overlapping_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed(b, l, r) ?:
+ cmp_int((unsigned long) l, (unsigned long) r);
+}
+
+static inline bool should_drop_next_key(struct sort_iter *iter)
+{
+ /*
+	 * key_sort_fix_overlapping_cmp() ensures that when keys compare equal
+	 * the older key comes first; so if l->k compares equal to r->k then
+	 * l->k is older and should be dropped.
+ */
+ return iter->used >= 2 &&
+ !bch2_bkey_cmp_packed(iter->b,
+ iter->data[0].k,
+ iter->data[1].k);
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
+ struct sort_iter *iter)
+{
+ struct bkey_packed *out = dst->start;
+ struct bkey_packed *k;
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
+
+ while ((k = sort_iter_peek(iter))) {
+ if (!bkey_deleted(k) &&
+ !should_drop_next_key(iter)) {
+ bkey_p_copy(out, k);
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_p_next(out);
+ }
+
+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
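When the same key exists in more than one bset, the merge above keeps only the newest copy: the comparator breaks ties by address so the older copy sorts first, and should_drop_next_key() discards it. A tiny sketch of that rule over a sorted (key, age) stream, using made-up data:

#include <stdio.h>

struct ent { int key, age; };	/* sorted by key; within a key, older first */

int main(void)
{
	struct ent in[] = {
		{ 1, 0 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 3, 0 },
	};
	unsigned n = sizeof(in) / sizeof(in[0]);

	for (unsigned i = 0; i < n; i++) {
		/* drop this copy if the next entry has the same key (it's newer) */
		if (i + 1 < n && in[i + 1].key == in[i].key)
			continue;

		printf("key %d age %d\n", in[i].key, in[i].age);
	}
	return 0;	/* keeps (1,0), (2,2), (3,0) */
}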
+
+/* Sort + repack in a new format: */
+struct btree_nr_keys
+bch2_sort_repack(struct bset *dst, struct btree *src,
+ struct btree_node_iter *src_iter,
+ struct bkey_format *out_f,
+ bool filter_whiteouts)
+{
+ struct bkey_format *in_f = &src->format;
+ struct bkey_packed *in, *out = vstruct_last(dst);
+ struct btree_nr_keys nr;
+ bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
+
+ memset(&nr, 0, sizeof(nr));
+
+ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
+ if (filter_whiteouts && bkey_deleted(in))
+ continue;
+
+ if (!transform)
+ bkey_p_copy(out, in);
+ else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+ ? in_f : &bch2_bkey_format_current, in))
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bch2_bkey_unpack(src, (void *) out, in);
+
+ out->needs_whiteout = false;
+
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_p_next(out);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
+static inline int sort_keys_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
+ (int) l->needs_whiteout - (int) r->needs_whiteout;
+}
+
+unsigned bch2_sort_keys(struct bkey_packed *dst,
+ struct sort_iter *iter,
+ bool filter_whiteouts)
+{
+ const struct bkey_format *f = &iter->b->format;
+ struct bkey_packed *in, *next, *out = dst;
+
+ sort_iter_sort(iter, sort_keys_cmp);
+
+ while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+ bool needs_whiteout = false;
+
+ if (bkey_deleted(in) &&
+ (filter_whiteouts || !in->needs_whiteout))
+ continue;
+
+ while ((next = sort_iter_peek(iter)) &&
+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
+ BUG_ON(in->needs_whiteout &&
+ next->needs_whiteout);
+ needs_whiteout |= in->needs_whiteout;
+ in = sort_iter_next(iter, sort_keys_cmp);
+ }
+
+ if (bkey_deleted(in)) {
+ memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
+ set_bkeyp_val_u64s(f, out, 0);
+ } else {
+ bkey_p_copy(out, in);
+ }
+ out->needs_whiteout |= needs_whiteout;
+ out = bkey_p_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
new file mode 100644
index 000000000000..7c0f0b160f18
--- /dev/null
+++ b/fs/bcachefs/bkey_sort.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_SORT_H
+#define _BCACHEFS_BKEY_SORT_H
+
+struct sort_iter {
+ struct btree *b;
+ unsigned used;
+ unsigned size;
+
+ struct sort_iter_set {
+ struct bkey_packed *k, *end;
+ } data[];
+};
+
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
+{
+ iter->b = b;
+ iter->used = 0;
+ iter->size = size;
+}
+
+struct sort_iter_stack {
+ struct sort_iter iter;
+ struct sort_iter_set sets[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
+{
+ sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
+}
+
+static inline void sort_iter_add(struct sort_iter *iter,
+ struct bkey_packed *k,
+ struct bkey_packed *end)
+{
+ BUG_ON(iter->used >= iter->size);
+
+ if (k != end)
+ iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
+ struct sort_iter *);
+
+struct btree_nr_keys
+bch2_sort_repack(struct bset *, struct btree *,
+ struct btree_node_iter *,
+ struct bkey_format *, bool);
+
+unsigned bch2_sort_keys(struct bkey_packed *,
+ struct sort_iter *, bool);
+
+#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
new file mode 100644
index 000000000000..bb73ba9017b0
--- /dev/null
+++ b/fs/bcachefs/bset.c
@@ -0,0 +1,1592 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for working with individual keys, and sorted sets of keys with in a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "bset.h"
+#include "eytzinger.h"
+#include "trace.h"
+#include "util.h"
+
+#include <asm/unaligned.h>
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
+ struct btree *);
+
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
+{
+ unsigned n = ARRAY_SIZE(iter->data);
+
+ while (n && __btree_node_iter_set_end(iter, n - 1))
+ --n;
+
+ return n;
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+ return bch2_bkey_to_bset_inlined(b, k);
+}
+
+/*
+ * There are never duplicate live keys in the btree - but once we include keys
+ * that have been flagged as deleted (and will be cleaned up later) we _will_
+ * see duplicates.
+ *
+ * Thus the sort order is: usual key comparison first, but for keys that compare
+ * equal the deleted key(s) come first, and the (at most one) live version comes
+ * last.
+ *
+ * The main reason for this is insertion: to handle overwrites, we first iterate
+ * over keys that compare equal to our insert key, and then insert immediately
+ * prior to the first key greater than the key we're inserting - our insert
+ * position will be after all keys that compare equal to our insert key, which
+ * by the time we actually do the insert will all be deleted.
+ */
+
+void bch2_dump_bset(struct bch_fs *c, struct btree *b,
+ struct bset *i, unsigned set)
+{
+ struct bkey_packed *_k, *_n;
+ struct bkey uk, n;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+
+ if (!i->u64s)
+ return;
+
+ for (_k = i->start;
+ _k < vstruct_last(i);
+ _k = _n) {
+ _n = bkey_p_next(_k);
+
+ k = bkey_disassemble(b, _k, &uk);
+
+ printbuf_reset(&buf);
+ if (c)
+ bch2_bkey_val_to_text(&buf, c, k);
+ else
+ bch2_bkey_to_text(&buf, k.k);
+ printk(KERN_ERR "block %u key %5zu: %s\n", set,
+ _k->_data - i->_data, buf.buf);
+
+ if (_n == vstruct_last(i))
+ continue;
+
+ n = bkey_unpack_key(b, _n);
+
+ if (bpos_lt(n.p, k.k->p)) {
+ printk(KERN_ERR "Key skipped backwards\n");
+ continue;
+ }
+
+ if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
+ printk(KERN_ERR "Duplicate keys\n");
+ }
+
+ printbuf_exit(&buf);
+}
+
+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
+{
+ struct bset_tree *t;
+
+ console_lock();
+ for_each_bset(b, t)
+ bch2_dump_bset(c, b, bset(b, t), t - b->set);
+ console_unlock();
+}
+
+void bch2_dump_btree_node_iter(struct btree *b,
+ struct btree_node_iter *iter)
+{
+ struct btree_node_iter_set *set;
+ struct printbuf buf = PRINTBUF;
+
+ printk(KERN_ERR "btree node iter with %u/%u sets:\n",
+ __btree_node_iter_used(iter), b->nsets);
+
+ btree_node_iter_for_each(iter, set) {
+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
+ struct bkey uk = bkey_unpack_key(b, k);
+
+ printbuf_reset(&buf);
+ bch2_bkey_to_text(&buf, &uk);
+ printk(KERN_ERR "set %zu key %u: %s\n",
+ t - b->set, set->k, buf.buf);
+ }
+
+ printbuf_exit(&buf);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+ struct bset_tree *t;
+ struct bkey_packed *k;
+ struct btree_nr_keys nr = { 0 };
+
+ for_each_bset(b, t)
+ bset_tree_for_each_key(b, t, k)
+ if (!bkey_deleted(k))
+ btree_keys_account_key_add(&nr, t - b->set, k);
+
+ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
+}
+
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+ struct btree *b)
+{
+ struct btree_node_iter iter = *_iter;
+ const struct bkey_packed *k, *n;
+
+ k = bch2_btree_node_iter_peek_all(&iter, b);
+ __bch2_btree_node_iter_advance(&iter, b);
+ n = bch2_btree_node_iter_peek_all(&iter, b);
+
+ bkey_unpack_key(b, k);
+
+ if (n &&
+ bkey_iter_cmp(b, k, n) > 0) {
+ struct btree_node_iter_set *set;
+ struct bkey ku = bkey_unpack_key(b, k);
+ struct bkey nu = bkey_unpack_key(b, n);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &ku);
+ bch2_bkey_to_text(&buf2, &nu);
+ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
+ buf1.buf, buf2.buf);
+ printk(KERN_ERR "iter was:");
+
+ btree_node_iter_for_each(_iter, set) {
+ struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
+ struct bset_tree *t = bch2_bkey_to_bset(b, k2);
+ printk(" [%zi %zi]", t - b->set,
+ k2->_data - bset(b, t)->_data);
+ }
+ panic("\n");
+ }
+}
+
+void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter_set *set, *s2;
+ struct bkey_packed *k, *p;
+ struct bset_tree *t;
+
+ if (bch2_btree_node_iter_end(iter))
+ return;
+
+ /* Verify no duplicates: */
+ btree_node_iter_for_each(iter, set) {
+ BUG_ON(set->k > set->end);
+ btree_node_iter_for_each(iter, s2)
+ BUG_ON(set != s2 && set->end == s2->end);
+ }
+
+ /* Verify that set->end is correct: */
+ btree_node_iter_for_each(iter, set) {
+ for_each_bset(b, t)
+ if (set->end == t->end_offset)
+ goto found;
+ BUG();
+found:
+ BUG_ON(set->k < btree_bkey_first_offset(t) ||
+ set->k >= t->end_offset);
+ }
+
+ /* Verify iterator is sorted: */
+ btree_node_iter_for_each(iter, set)
+ BUG_ON(set != iter->data &&
+ btree_node_iter_cmp(b, set[-1], set[0]) > 0);
+
+ k = bch2_btree_node_iter_peek_all(iter, b);
+
+ for_each_bset(b, t) {
+ if (iter->data[0].end == t->end_offset)
+ continue;
+
+ p = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
+
+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
+ }
+}
+
+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+ struct bkey_packed *insert, unsigned clobber_u64s)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, where);
+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
+ struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+#if 0
+ BUG_ON(prev &&
+ bkey_iter_cmp(b, prev, insert) > 0);
+#else
+ if (prev &&
+ bkey_iter_cmp(b, prev, insert) > 0) {
+ struct bkey k1 = bkey_unpack_key(b, prev);
+ struct bkey k2 = bkey_unpack_key(b, insert);
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
+
+ panic("prev > insert:\n"
+ "prev key %s\n"
+ "insert key %s\n",
+ buf1.buf, buf2.buf);
+ }
+#endif
+#if 0
+ BUG_ON(next != btree_bkey_last(b, t) &&
+ bkey_iter_cmp(b, insert, next) > 0);
+#else
+ if (next != btree_bkey_last(b, t) &&
+ bkey_iter_cmp(b, insert, next) > 0) {
+ struct bkey k1 = bkey_unpack_key(b, insert);
+ struct bkey k2 = bkey_unpack_key(b, next);
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
+
+ panic("insert > next:\n"
+ "insert key %s\n"
+ "next key %s\n",
+ buf1.buf, buf2.buf);
+ }
+#endif
+}
+
+#else
+
+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
+ struct btree *b) {}
+
+#endif
+
+/* Auxiliary search trees */
+
+#define BFLOAT_FAILED_UNPACKED U8_MAX
+#define BFLOAT_FAILED U8_MAX
+
+struct bkey_float {
+ u8 exponent;
+ u8 key_offset;
+ u16 mantissa;
+};
+#define BKEY_MANTISSA_BITS 16
+
+static unsigned bkey_float_byte_offset(unsigned idx)
+{
+ return idx * sizeof(struct bkey_float);
+}
+
+struct ro_aux_tree {
+ u8 nothing[0];
+ struct bkey_float f[];
+};
+
+struct rw_aux_tree {
+ u16 offset;
+ struct bpos k;
+};
+
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
+{
+ BUG_ON(t->aux_data_offset == U16_MAX);
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ return t->aux_data_offset;
+ case BSET_RO_AUX_TREE:
+ return t->aux_data_offset +
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
+ t->size * sizeof(u8), 8);
+ case BSET_RW_AUX_TREE:
+ return t->aux_data_offset +
+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
+ default:
+ BUG();
+ }
+}
+
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return t == b->set
+ ? DIV_ROUND_UP(b->unpack_fn_len, 8)
+ : bset_aux_tree_buf_end(t - 1);
+}
+
+static void *__aux_tree_base(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return b->aux_data + t->aux_data_offset * 8;
+}
+
+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ return __aux_tree_base(b, t);
+}
+
+static u8 *ro_aux_tree_prev(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
+}
+
+static struct bkey_float *bkey_float(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned idx)
+{
+ return ro_aux_tree_base(b, t)->f + idx;
+}
+
+static void bset_aux_tree_verify(const struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ const struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ if (t->aux_data_offset == U16_MAX)
+ continue;
+
+ BUG_ON(t != b->set &&
+ t[-1].aux_data_offset == U16_MAX);
+
+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
+ }
+#endif
+}
+
+void bch2_btree_keys_init(struct btree *b)
+{
+ unsigned i;
+
+ b->nsets = 0;
+ memset(&b->nr, 0, sizeof(b->nr));
+
+ for (i = 0; i < MAX_BSETS; i++)
+ b->set[i].data_offset = U16_MAX;
+
+ bch2_bset_set_no_aux_tree(b, b->set);
+}
+
+/* Binary tree stuff for auxiliary search trees */
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * The ro aux tree is a binary search tree laid out in an array; each node
+ * corresponds to a key in one cacheline of the bset (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; __eytzinger1_to_inorder() gives us the cacheline,
+ * and then bkey_float->key_offset gives us the offset within that cacheline,
+ * in units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. ro_aux_tree_prev(b, t)[j]
+ * stores the size of the previous key so we can walk backwards to it from the
+ * key that tree node j points to.
+ */
+
+static inline void *bset_cacheline(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline)
+{
+ return (void *) round_down((unsigned long) btree_bkey_first(b, t),
+ L1_CACHE_BYTES) +
+ cacheline * BSET_CACHELINE;
+}
+
+static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ unsigned offset)
+{
+ return bset_cacheline(b, t, cacheline) + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(const struct btree *b,
+ const struct bset_tree *t,
+ const struct bkey_packed *k)
+{
+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
+}
+
+static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ const struct bkey_packed *k)
+{
+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
+}
+
+static unsigned bkey_to_cacheline_offset(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ const struct bkey_packed *k)
+{
+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
+
+ EBUG_ON(m > U8_MAX);
+ return m;
+}
+
+static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned j)
+{
+ return cacheline_to_bkey(b, t,
+ __eytzinger1_to_inorder(j, t->size - 1, t->extra),
+ bkey_float(b, t, j)->key_offset);
+}
+
+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned j)
+{
+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
+
+ return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
+}
+
+static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+ return __aux_tree_base(b, t);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, just a simple lookup table (the rw aux tree).
+ */
+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
+ struct bset_tree *t,
+ unsigned j)
+{
+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
+}
+
+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
+ unsigned j, struct bkey_packed *k)
+{
+ EBUG_ON(k >= btree_bkey_last(b, t));
+
+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
+ .offset = __btree_node_key_to_offset(b, k),
+ .k = bkey_unpack_pos(b, k),
+ };
+}
+
+static void bch2_bset_verify_rw_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ struct bkey_packed *k = btree_bkey_first(b, t);
+ unsigned j = 0;
+
+ if (!bch2_expensive_debug_checks)
+ return;
+
+ BUG_ON(bset_has_ro_aux_tree(t));
+
+ if (!bset_has_rw_aux_tree(t))
+ return;
+
+ BUG_ON(t->size < 1);
+ BUG_ON(rw_aux_to_bkey(b, t, j) != k);
+
+ goto start;
+ while (1) {
+ if (rw_aux_to_bkey(b, t, j) == k) {
+ BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
+ bkey_unpack_pos(b, k)));
+start:
+ if (++j == t->size)
+ break;
+
+ BUG_ON(rw_aux_tree(b, t)[j].offset <=
+ rw_aux_tree(b, t)[j - 1].offset);
+ }
+
+ k = bkey_p_next(k);
+ BUG_ON(k >= btree_bkey_last(b, t));
+ }
+}
+
+/* returns idx of first entry >= offset: */
+static unsigned rw_aux_tree_bsearch(struct btree *b,
+ struct bset_tree *t,
+ unsigned offset)
+{
+ unsigned bset_offs = offset - btree_bkey_first_offset(t);
+ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
+ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
+
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+ EBUG_ON(!t->size);
+ EBUG_ON(idx > t->size);
+
+ while (idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset < offset)
+ idx++;
+
+ while (idx &&
+ rw_aux_tree(b, t)[idx - 1].offset >= offset)
+ idx--;
+
+ EBUG_ON(idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset < offset);
+ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
+ EBUG_ON(idx + 1 < t->size &&
+ rw_aux_tree(b, t)[idx].offset ==
+ rw_aux_tree(b, t)[idx + 1].offset);
+
+ return idx;
+}
+
+static inline unsigned bkey_mantissa(const struct bkey_packed *k,
+ const struct bkey_float *f,
+ unsigned idx)
+{
+ u64 v;
+
+ EBUG_ON(!bkey_packed(k));
+
+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
+
+ /*
+ * In little endian, we're shifting off low bits (and then the bits we
+ * want are at the low end), in big endian we're shifting off high bits
+ * (and then the bits we want are at the high end, so we shift them
+ * back down):
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ v >>= f->exponent & 7;
+#else
+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
+#endif
+ return (u16) v;
+}
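bkey_mantissa() pulls 16 bits out of the packed key starting at the bit offset encoded in f->exponent: a byte offset (exponent >> 3) plus a shift of the remaining low bits. A standalone sketch of that extraction, assuming a little-endian host and an in-bounds 8-byte read (not the real helper):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint16_t mantissa16(const uint8_t *key, unsigned bit_offset)
{
	uint64_t v;

	memcpy(&v, key + (bit_offset >> 3), sizeof(v));
	v >>= bit_offset & 7;

	return (uint16_t) v;
}

int main(void)
{
	uint8_t key[16] = { 0 };

	key[2] = 0xab;	/* bits 16..23 */
	key[3] = 0xcd;	/* bits 24..31 */

	printf("0x%04x\n", mantissa16(key, 16));	/* 0xcdab */
	printf("0x%04x\n", mantissa16(key, 20));	/* 0x0cda */

	return 0;
}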
+
+static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
+{
+ struct bkey_float *f = bkey_float(b, t, j);
+ struct bkey_packed *m = tree_to_bkey(b, t, j);
+ struct bkey_packed *l = is_power_of_2(j)
+ ? min_key
+ : tree_to_prev_bkey(b, t, j >> ffs(j));
+ struct bkey_packed *r = is_power_of_2(j + 1)
+ ? max_key
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
+ unsigned mantissa;
+ int shift, exponent, high_bit;
+
+ /*
+ * for failed bfloats, the lookup code falls back to comparing against
+ * the original key.
+ */
+
+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
+ !b->nr_key_bits) {
+ f->exponent = BFLOAT_FAILED_UNPACKED;
+ return;
+ }
+
+ /*
+ * The greatest differing bit of l and r is the first bit we must
+ * include in the bfloat mantissa we're creating in order to do
+ * comparisons - that bit always becomes the high bit of
+ * bfloat->mantissa, and thus the exponent we're calculating here is
+ * the position of what will become the low bit in bfloat->mantissa:
+ *
+ * Note that this may be negative - we may be running off the low end
+ * of the key: we handle this later:
+ */
+ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
+
+ /*
+ * Then we calculate the actual shift value, from the start of the key
+ * (k->_data), to get the key bits starting at exponent:
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
+
+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
+#else
+ shift = high_bit_offset +
+ b->nr_key_bits -
+ exponent -
+ BKEY_MANTISSA_BITS;
+
+ EBUG_ON(shift < KEY_PACKED_BITS_START);
+#endif
+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
+
+ f->exponent = shift;
+ mantissa = bkey_mantissa(m, f, j);
+
+ /*
+ * If we've got garbage bits, set them to all 1s - it's legal for the
+ * bfloat to compare larger than the original key, but not smaller:
+ */
+ if (exponent < 0)
+ mantissa |= ~(~0U << -exponent);
+
+ f->mantissa = mantissa;
+}
+
+/* bytes remaining - only valid for last bset: */
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+ bset_aux_tree_verify(b);
+
+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
+}
+
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+ return __bset_tree_capacity(b, t) /
+ (sizeof(struct bkey_float) + sizeof(u8));
+}
+
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
+}
+
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bkey_packed *k;
+
+ t->size = 1;
+ t->extra = BSET_RW_AUX_TREE_VAL;
+ rw_aux_tree(b, t)[0].offset =
+ __btree_node_key_to_offset(b, btree_bkey_first(b, t));
+
+ bset_tree_for_each_key(b, t, k) {
+ if (t->size == bset_rw_tree_capacity(b, t))
+ break;
+
+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
+ L1_CACHE_BYTES)
+ rw_aux_tree_set(b, t, t->size++, k);
+ }
+}
+
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+ struct bkey_i min_key, max_key;
+ unsigned j, cacheline = 1;
+
+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
+ bset_ro_tree_capacity(b, t));
+retry:
+ if (t->size < 2) {
+ t->size = 0;
+ t->extra = BSET_NO_AUX_TREE_VAL;
+ return;
+ }
+
+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+ /* First we figure out where the first key in each cacheline is */
+ eytzinger1_for_each(j, t->size - 1) {
+ while (bkey_to_cacheline(b, t, k) < cacheline)
+ prev = k, k = bkey_p_next(k);
+
+ if (k >= btree_bkey_last(b, t)) {
+ /* XXX: this path sucks */
+ t->size--;
+ goto retry;
+ }
+
+ ro_aux_tree_prev(b, t)[j] = prev->u64s;
+ bkey_float(b, t, j)->key_offset =
+ bkey_to_cacheline_offset(b, t, cacheline++, k);
+
+ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
+ EBUG_ON(tree_to_bkey(b, t, j) != k);
+ }
+
+ while (k != btree_bkey_last(b, t))
+ prev = k, k = bkey_p_next(k);
+
+ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+ bkey_init(&min_key.k);
+ min_key.k.p = b->data->min_key;
+ }
+
+ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+ bkey_init(&max_key.k);
+ max_key.k.p = b->data->max_key;
+ }
+
+ /* Then we build the tree */
+ eytzinger1_for_each(j, t->size - 1)
+ make_bfloat(b, t, j,
+ bkey_to_packed(&min_key),
+ bkey_to_packed(&max_key));
+}
+
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bset_tree *i;
+
+ for (i = b->set; i != t; i++)
+ BUG_ON(bset_has_rw_aux_tree(i));
+
+ bch2_bset_set_no_aux_tree(b, t);
+
+ /* round up to next cacheline: */
+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
+ SMP_CACHE_BYTES / sizeof(u64));
+
+ bset_aux_tree_verify(b);
+}
+
+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
+ bool writeable)
+{
+ if (writeable
+ ? bset_has_rw_aux_tree(t)
+ : bset_has_ro_aux_tree(t))
+ return;
+
+ bset_alloc_tree(b, t);
+
+ if (!__bset_tree_capacity(b, t))
+ return;
+
+ if (writeable)
+ __build_rw_aux_tree(b, t);
+ else
+ __build_ro_aux_tree(b, t);
+
+ bset_aux_tree_verify(b);
+}
+
+void bch2_bset_init_first(struct btree *b, struct bset *i)
+{
+ struct bset_tree *t;
+
+ BUG_ON(b->nsets);
+
+ memset(i, 0, sizeof(*i));
+ get_random_bytes(&i->seq, sizeof(i->seq));
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ t = &b->set[b->nsets++];
+ set_btree_bset(b, t, i);
+}
+
+void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
+ struct btree_node_entry *bne)
+{
+ struct bset *i = &bne->keys;
+ struct bset_tree *t;
+
+ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
+ BUG_ON(b->nsets >= MAX_BSETS);
+
+ memset(i, 0, sizeof(*i));
+ i->seq = btree_bset_first(b)->seq;
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ t = &b->set[b->nsets++];
+ set_btree_bset(b, t, i);
+}
+
+/*
+ * find _some_ key in the same bset as @k that precedes @k - not necessarily the
+ * immediate predecessor:
+ */
+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct bkey_packed *p;
+ unsigned offset;
+ int j;
+
+ EBUG_ON(k < btree_bkey_first(b, t) ||
+ k > btree_bkey_last(b, t));
+
+ if (k == btree_bkey_first(b, t))
+ return NULL;
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ p = btree_bkey_first(b, t);
+ break;
+ case BSET_RO_AUX_TREE:
+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
+
+ do {
+ p = j ? tree_to_bkey(b, t,
+ __inorder_to_eytzinger1(j--,
+ t->size - 1, t->extra))
+ : btree_bkey_first(b, t);
+ } while (p >= k);
+ break;
+ case BSET_RW_AUX_TREE:
+ offset = __btree_node_key_to_offset(b, k);
+ j = rw_aux_tree_bsearch(b, t, offset);
+ p = j ? rw_aux_to_bkey(b, t, j - 1)
+ : btree_bkey_first(b, t);
+ break;
+ }
+
+ return p;
+}
+
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k,
+ unsigned min_key_type)
+{
+ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
+
+ while ((p = __bkey_prev(b, t, k)) && !ret) {
+ for (i = p; i != k; i = bkey_p_next(i))
+ if (i->type >= min_key_type)
+ ret = i;
+
+ k = p;
+ }
+
+ if (bch2_expensive_debug_checks) {
+ BUG_ON(ret >= orig_k);
+
+ for (i = ret
+ ? bkey_p_next(ret)
+ : btree_bkey_first(b, t);
+ i != orig_k;
+ i = bkey_p_next(i))
+ BUG_ON(i->type >= min_key_type);
+ }
+
+ return ret;
+}
+
+/* Insert */
+
+static void bch2_bset_fix_lookup_table(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *_where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ int shift = new_u64s - clobber_u64s;
+ unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+
+ EBUG_ON(bset_has_ro_aux_tree(t));
+
+ if (!bset_has_rw_aux_tree(t))
+ return;
+
+ /* returns first entry >= where */
+ l = rw_aux_tree_bsearch(b, t, where);
+
+ if (!l) /* never delete first entry */
+ l++;
+ else if (l < t->size &&
+ where < t->end_offset &&
+ rw_aux_tree(b, t)[l].offset == where)
+ rw_aux_tree_set(b, t, l++, _where);
+
+ /* l now > where */
+
+ for (j = l;
+ j < t->size &&
+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
+ j++)
+ ;
+
+ if (j < t->size &&
+ rw_aux_tree(b, t)[j].offset + shift ==
+ rw_aux_tree(b, t)[l - 1].offset)
+ j++;
+
+ memmove(&rw_aux_tree(b, t)[l],
+ &rw_aux_tree(b, t)[j],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[j]);
+ t->size -= j - l;
+
+ for (j = l; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
+
+ EBUG_ON(l < t->size &&
+ rw_aux_tree(b, t)[l].offset ==
+ rw_aux_tree(b, t)[l - 1].offset);
+
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (l < t->size
+ ? rw_aux_tree(b, t)[l].offset
+ : t->end_offset) -
+ rw_aux_tree(b, t)[l - 1].offset >
+ L1_CACHE_BYTES / sizeof(u64)) {
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
+ struct bkey_packed *end = l < t->size
+ ? rw_aux_to_bkey(b, t, l)
+ : btree_bkey_last(b, t);
+ struct bkey_packed *k = start;
+
+ while (1) {
+ k = bkey_p_next(k);
+ if (k == end)
+ break;
+
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[l + 1],
+ &rw_aux_tree(b, t)[l],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[l]);
+ t->size++;
+ rw_aux_tree_set(b, t, l, k);
+ break;
+ }
+ }
+ }
+
+ bch2_bset_verify_rw_aux_tree(b, t);
+ bset_aux_tree_verify(b);
+}
+
+void bch2_bset_insert(struct btree *b,
+ struct btree_node_iter *iter,
+ struct bkey_packed *where,
+ struct bkey_i *insert,
+ unsigned clobber_u64s)
+{
+ struct bkey_format *f = &b->format;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey_packed packed, *src = bkey_to_packed(insert);
+
+ bch2_bset_verify_rw_aux_tree(b, t);
+ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
+
+ if (bch2_bkey_pack_key(&packed, &insert->k, f))
+ src = &packed;
+
+ if (!bkey_deleted(&insert->k))
+ btree_keys_account_key_add(&b->nr, t - b->set, src);
+
+ if (src->u64s != clobber_u64s) {
+ u64 *src_p = (u64 *) where->_data + clobber_u64s;
+ u64 *dst_p = (u64 *) where->_data + src->u64s;
+
+ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
+ (int) clobber_u64s - src->u64s);
+
+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
+ set_btree_bset_end(b, t);
+ }
+
+ memcpy_u64s_small(where, src,
+ bkeyp_key_u64s(f, src));
+ memcpy_u64s(bkeyp_val(f, where), &insert->v,
+ bkeyp_val_u64s(f, src));
+
+ if (src->u64s != clobber_u64s)
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+
+ bch2_verify_btree_nr_keys(b);
+}
+
+void bch2_bset_delete(struct btree *b,
+ struct bkey_packed *where,
+ unsigned clobber_u64s)
+{
+ struct bset_tree *t = bset_tree_last(b);
+ u64 *src_p = (u64 *) where->_data + clobber_u64s;
+ u64 *dst_p = where->_data;
+
+ bch2_bset_verify_rw_aux_tree(b, t);
+
+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
+
+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
+ set_btree_bset_end(b, t);
+
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
+}
+
+/* Lookup */
+
+__flatten
+static struct bkey_packed *bset_search_write_set(const struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search)
+{
+ unsigned l = 0, r = t->size;
+
+ while (l + 1 != r) {
+ unsigned m = (l + r) >> 1;
+
+ if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
+ l = m;
+ else
+ r = m;
+ }
+
+ return rw_aux_to_bkey(b, t, l);
+}
+
+static inline void prefetch_four_cachelines(void *p)
+{
+#ifdef CONFIG_X86_64
+ asm("prefetcht0 (-127 + 64 * 0)(%0);"
+ "prefetcht0 (-127 + 64 * 1)(%0);"
+ "prefetcht0 (-127 + 64 * 2)(%0);"
+ "prefetcht0 (-127 + 64 * 3)(%0);"
+ :
+ : "r" (p + 127));
+#else
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+}
+
+static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
+ const struct bkey_float *f,
+ unsigned idx)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
+
+ return f->exponent > key_bits_start;
+#else
+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
+
+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
+#endif
+}
+
+__flatten
+static struct bkey_packed *bset_search_tree(const struct btree *b,
+ const struct bset_tree *t,
+ const struct bpos *search,
+ const struct bkey_packed *packed_search)
+{
+ struct ro_aux_tree *base = ro_aux_tree_base(b, t);
+ struct bkey_float *f;
+ struct bkey_packed *k;
+ unsigned inorder, n = 1, l, r;
+ int cmp;
+
+ do {
+ if (likely(n << 4 < t->size))
+ prefetch(&base->f[n << 4]);
+
+ f = &base->f[n];
+ if (unlikely(f->exponent >= BFLOAT_FAILED))
+ goto slowpath;
+
+ l = f->mantissa;
+ r = bkey_mantissa(packed_search, f, n);
+
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
+ goto slowpath;
+
+ n = n * 2 + (l < r);
+ continue;
+slowpath:
+ k = tree_to_bkey(b, t, n);
+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
+ if (!cmp)
+ return k;
+
+ n = n * 2 + (cmp < 0);
+ } while (n < t->size);
+
+ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
+
+ /*
+ * n would have been the node we recursed to - the low bit tells us if
+ * we recursed left or recursed right.
+ */
+ if (likely(!(n & 1))) {
+ --inorder;
+ if (unlikely(!inorder))
+ return btree_bkey_first(b, t);
+
+ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
+ }
+
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
+}
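bset_search_tree() descends an implicit eytzinger-layout tree with n = n * 2 + (l < r), comparing 16-bit mantissas and only touching the packed key when a node's bfloat can't decide. A self-contained sketch of the same descent over a sorted int array stored in 1-indexed eytzinger order (children of node k at 2k and 2k+1); __builtin_ffs is a GCC/Clang builtin, and the whole thing is illustrative only:

#include <stdio.h>

#define N 7

static int build(const int *sorted, int *eyt, int i, int k)
{
	if (k <= N) {
		i = build(sorted, eyt, i, 2 * k);
		eyt[k] = sorted[i++];
		i = build(sorted, eyt, i, 2 * k + 1);
	}
	return i;
}

/* returns the eytzinger index of the first element >= x, or 0 if none */
static int search(const int *eyt, int x)
{
	int k = 1;

	while (k <= N)
		k = 2 * k + (eyt[k] < x);

	return k >> __builtin_ffs(~k);
}

int main(void)
{
	int sorted[N] = { 1, 3, 5, 7, 9, 11, 13 };
	int eyt[N + 1];

	build(sorted, eyt, 0, 1);

	for (int x = 0; x <= 14; x += 7) {
		int k = search(eyt, x);

		if (k)
			printf("first >= %2d: %d\n", x, eyt[k]);
		else
			printf("first >= %2d: none\n", x);
	}
	return 0;
}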
+
+static __always_inline __flatten
+struct bkey_packed *__bch2_bset_search(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ const struct bkey_packed *lossy_packed_search)
+{
+
+ /*
+	 * First we search for a cacheline, and then we do a linear search
+ * within that cacheline.
+ *
+	 * To search for the cacheline, there are three different possibilities:
+ * * The set is too small to have a search tree, so we just do a linear
+ * search over the whole set.
+ * * The set is the one we're currently inserting into; keeping a full
+ * auxiliary search tree up to date would be too expensive, so we
+ * use a much simpler lookup table to do a binary search -
+ * bset_search_write_set().
+ * * Or we use the auxiliary search tree we constructed earlier -
+ * bset_search_tree()
+ */
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ return btree_bkey_first(b, t);
+ case BSET_RW_AUX_TREE:
+ return bset_search_write_set(b, t, search);
+ case BSET_RO_AUX_TREE:
+ return bset_search_tree(b, t, search, lossy_packed_search);
+ default:
+ BUG();
+ }
+}
+
+static __always_inline __flatten
+struct bkey_packed *bch2_bset_search_linear(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search,
+ struct bkey_packed *m)
+{
+ if (lossy_packed_search)
+ while (m != btree_bkey_last(b, t) &&
+ bkey_iter_cmp_p_or_unp(b, m,
+ lossy_packed_search, search) < 0)
+ m = bkey_p_next(m);
+
+ if (!packed_search)
+ while (m != btree_bkey_last(b, t) &&
+ bkey_iter_pos_cmp(b, m, search) < 0)
+ m = bkey_p_next(m);
+
+ if (bch2_expensive_debug_checks) {
+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
+
+ BUG_ON(prev &&
+ bkey_iter_cmp_p_or_unp(b, prev,
+ packed_search, search) >= 0);
+ }
+
+ return m;
+}
+
+/* Btree node iterator */
+
+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ if (k != end) {
+ struct btree_node_iter_set *pos;
+
+ btree_node_iter_for_each(iter, pos)
+ ;
+
+ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
+ *pos = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ };
+ }
+}
+
+void bch2_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ __bch2_btree_node_iter_push(iter, b, k, end);
+ bch2_btree_node_iter_sort(iter, b);
+}
+
+noinline __flatten __cold
+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
+ struct btree *b, struct bpos *search)
+{
+ struct bkey_packed *k;
+
+ trace_bkey_pack_pos_fail(search);
+
+ bch2_btree_node_iter_init_from_start(iter, b);
+
+ while ((k = bch2_btree_node_iter_peek(iter, b)) &&
+ bkey_iter_pos_cmp(b, k, search) < 0)
+ bch2_btree_node_iter_advance(iter, b);
+}
+
+/**
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * given position
+ *
+ * @iter: iterator to initialize
+ * @b: btree node to search
+ * @search: search key
+ *
+ * Main entry point to the lookup code for individual btree nodes:
+ *
+ * NOTE:
+ *
+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
+ * keys. This doesn't matter for most code, but it does matter for lookups.
+ *
+ * Consider a run of adjacent keys containing a string of equal keys:
+ * i j k k k k l m
+ *
+ * If you search for k, the lookup code isn't guaranteed to return you any
+ * specific k. The lookup code is conceptually doing a binary search, and
+ * iterating backwards is very expensive, so if the pivot happens to land at
+ * the last k, that's what you'll get.
+ *
+ * This works out ok, but it's something to be aware of:
+ *
+ * - For non extents, we guarantee that the live key comes last - see
+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
+ * see will only be deleted keys you don't care about.
+ *
+ * - For extents, deleted keys sort last (see the comment at the top of this
+ * file). But when you're searching for extents, you actually want the first
+ * key strictly greater than your search key - an extent that compares equal
+ * to the search key is going to have 0 sectors after the search key.
+ *
+ * But this does mean that we can't just search for
+ * bpos_successor(start_of_range) to get the first extent that overlaps with
+ * the range we want - if we're unlucky and there's an extent that ends
+ * exactly where we searched, then there could be a deleted key at the same
+ * position and we'd get that when we search instead of the preceding extent
+ * we needed.
+ *
+ * So we've got to search for start_of_range, then after the lookup iterate
+ * past any extents that compare equal to the position we searched for.
+ */
+__flatten
+void bch2_btree_node_iter_init(struct btree_node_iter *iter,
+ struct btree *b, struct bpos *search)
+{
+ struct bkey_packed p, *packed_search = NULL;
+ struct btree_node_iter_set *pos = iter->data;
+ struct bkey_packed *k[MAX_BSETS];
+ unsigned i;
+
+ EBUG_ON(bpos_lt(*search, b->data->min_key));
+ EBUG_ON(bpos_gt(*search, b->data->max_key));
+ bset_aux_tree_verify(b);
+
+ memset(iter, 0, sizeof(*iter));
+
+ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
+ case BKEY_PACK_POS_EXACT:
+ packed_search = &p;
+ break;
+ case BKEY_PACK_POS_SMALLER:
+ packed_search = NULL;
+ break;
+ case BKEY_PACK_POS_FAIL:
+ btree_node_iter_init_pack_failed(iter, b, search);
+ return;
+ }
+
+ for (i = 0; i < b->nsets; i++) {
+ k[i] = __bch2_bset_search(b, b->set + i, search, &p);
+ prefetch_four_cachelines(k[i]);
+ }
+
+ for (i = 0; i < b->nsets; i++) {
+ struct bset_tree *t = b->set + i;
+ struct bkey_packed *end = btree_bkey_last(b, t);
+
+ k[i] = bch2_bset_search_linear(b, t, search,
+ packed_search, &p, k[i]);
+ if (k[i] != end)
+ *pos++ = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k[i]),
+ __btree_node_key_to_offset(b, end)
+ };
+ }
+
+ bch2_btree_node_iter_sort(iter, b);
+}
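The comment above bch2_btree_node_iter_init() makes two points: with duplicate (deleted) keys, a binary search may land on any member of a run of equal keys, and extent lookups therefore iterate past keys that compare equal to the search position afterwards. A small standalone sketch of the second point - finding the run of equal keys and stepping past it - with a plain lower-bound search over made-up data:

#include <stdio.h>

static int lower_bound(const int *a, int n, int x)
{
	int l = 0, r = n;

	while (l < r) {
		int m = (l + r) / 2;

		if (a[m] < x)
			l = m + 1;
		else
			r = m;
	}
	return l;
}

int main(void)
{
	int a[] = { 1, 2, 4, 4, 4, 4, 5, 6 };
	int n = sizeof(a) / sizeof(a[0]);
	int i = lower_bound(a, n, 4);

	printf("first >= 4 at index %d\n", i);

	/* the "extent" case: step past everything equal to the search key */
	while (i < n && a[i] == 4)
		i++;

	printf("first >  4 at index %d (value %d)\n", i, a[i]);
	return 0;
}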
+
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bset_tree *t;
+
+ memset(iter, 0, sizeof(*iter));
+
+ for_each_bset(b, t)
+ __bch2_btree_node_iter_push(iter, b,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ bch2_btree_node_iter_sort(iter, b);
+}
+
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t)
+{
+ struct btree_node_iter_set *set;
+
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset)
+ return __btree_node_offset_to_key(b, set->k);
+
+ return btree_bkey_last(b, t);
+}
+
+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
+ struct btree *b,
+ unsigned first)
+{
+ bool ret;
+
+ if ((ret = (btree_node_iter_cmp(b,
+ iter->data[first],
+ iter->data[first + 1]) > 0)))
+ swap(iter->data[first], iter->data[first + 1]);
+ return ret;
+}
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ /* unrolled bubble sort: */
+
+ if (!__btree_node_iter_set_end(iter, 2)) {
+ btree_node_iter_sort_two(iter, b, 0);
+ btree_node_iter_sort_two(iter, b, 1);
+ }
+
+ if (!__btree_node_iter_set_end(iter, 1))
+ btree_node_iter_sort_two(iter, b, 0);
+}
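With at most three sets per node, sorting the iterator is an unrolled bubble sort: compare-and-swap positions (0,1), then (1,2), then (0,1) again, which is a complete sorting network for three elements. The same network on plain ints, as a quick standalone sanity sketch:

#include <stdio.h>

static void sort2(int *a, int i)
{
	if (a[i] > a[i + 1]) {
		int tmp = a[i];

		a[i] = a[i + 1];
		a[i + 1] = tmp;
	}
}

int main(void)
{
	int a[3] = { 9, 1, 5 };

	/* same compare/swap sequence as bch2_btree_node_iter_sort() */
	sort2(a, 0);
	sort2(a, 1);
	sort2(a, 0);

	printf("%d %d %d\n", a[0], a[1], a[2]);	/* 1 5 9 */
	return 0;
}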
+
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
+ struct btree_node_iter_set *set)
+{
+ struct btree_node_iter_set *last =
+ iter->data + ARRAY_SIZE(iter->data) - 1;
+
+ memmove(&set[0], &set[1], (void *) last - (void *) set);
+ *last = (struct btree_node_iter_set) { 0, 0 };
+}
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
+
+ EBUG_ON(iter->data->k > iter->data->end);
+
+ if (unlikely(__btree_node_iter_set_end(iter, 0))) {
+ /* avoid an expensive memmove call: */
+ iter->data[0] = iter->data[1];
+ iter->data[1] = iter->data[2];
+ iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
+ return;
+ }
+
+ if (__btree_node_iter_set_end(iter, 1))
+ return;
+
+ if (!btree_node_iter_sort_two(iter, b, 0))
+ return;
+
+ if (__btree_node_iter_set_end(iter, 2))
+ return;
+
+ btree_node_iter_sort_two(iter, b, 1);
+}
+
+void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ if (bch2_expensive_debug_checks) {
+ bch2_btree_node_iter_verify(iter, b);
+ bch2_btree_node_iter_next_check(iter, b);
+ }
+
+ __bch2_btree_node_iter_advance(iter, b);
+}
+
+/*
+ * Expensive - iterating backwards means checking every bset for its previous key:
+ */
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *k, *prev = NULL;
+ struct btree_node_iter_set *set;
+ struct bset_tree *t;
+ unsigned end = 0;
+
+ if (bch2_expensive_debug_checks)
+ bch2_btree_node_iter_verify(iter, b);
+
+ for_each_bset(b, t) {
+ k = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
+ if (k &&
+ (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
+ prev = k;
+ end = t->end_offset;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ /*
+ * We're manually memmoving instead of just calling sort() to ensure the
+ * prev we picked ends up in slot 0 - sort won't necessarily put it
+ * there because of duplicate deleted keys:
+ */
+ btree_node_iter_for_each(iter, set)
+ if (set->end == end)
+ goto found;
+
+ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
+found:
+ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
+
+ memmove(&iter->data[1],
+ &iter->data[0],
+ (void *) set - (void *) &iter->data[0]);
+
+ iter->data[0].k = __btree_node_key_to_offset(b, prev);
+ iter->data[0].end = end;
+
+ if (bch2_expensive_debug_checks)
+ bch2_btree_node_iter_verify(iter, b);
+ return prev;
+}
+
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *prev;
+
+ do {
+ prev = bch2_btree_node_iter_prev_all(iter, b);
+ } while (prev && bkey_deleted(prev));
+
+ return prev;
+}
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey *u)
+{
+ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
+
+ return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
+}
+
+/* Mergesort */
+
+void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
+{
+ const struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ enum bset_aux_tree_type type = bset_aux_tree_type(t);
+ size_t j;
+
+ stats->sets[type].nr++;
+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
+ sizeof(u64);
+
+ if (bset_has_ro_aux_tree(t)) {
+ stats->floats += t->size - 1;
+
+ for (j = 1; j < t->size; j++)
+ stats->failed +=
+ bkey_float(b, t, j)->exponent ==
+ BFLOAT_FAILED;
+ }
+ }
+}
+
+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
+ struct bkey_packed *k)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
+ struct bkey uk;
+ unsigned j, inorder;
+
+ if (!bset_has_ro_aux_tree(t))
+ return;
+
+ inorder = bkey_to_cacheline(b, t, k);
+ if (!inorder || inorder >= t->size)
+ return;
+
+ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
+ if (k != tree_to_bkey(b, t, j))
+ return;
+
+ switch (bkey_float(b, t, j)->exponent) {
+ case BFLOAT_FAILED:
+ uk = bkey_unpack_key(b, k);
+ prt_printf(out,
+ " failed unpacked at depth %u\n"
+ "\t",
+ ilog2(j));
+ bch2_bpos_to_text(out, uk.p);
+ prt_printf(out, "\n");
+ break;
+ }
+}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
new file mode 100644
index 000000000000..632c2b8c5460
--- /dev/null
+++ b/fs/bcachefs/bset.h
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BSET_H
+#define _BCACHEFS_BSET_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "util.h" /* for time_stats */
+#include "vstructs.h"
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bkey_invalid and
+ * bkey_deleted().
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance-wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ above bit 50, we don't need to check anything higher than bit
+ * 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking. Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check whether p and n differ in the bits we need them to. If they don't, we
+ * flag that node, and when doing lookups we fall back to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
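+ *
+ * (Checking that arithmetic, as an illustration rather than a format
+ * definition: 7 exponent bits + 3 offset bits + 22 mantissa bits = 32 bits,
+ * i.e. a 4 byte node, and one 4 byte node per 128 bytes of keys works out to
+ * 4/128 ~= 3%.)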
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+enum bset_aux_tree_type {
+ BSET_NO_AUX_TREE,
+ BSET_RO_AUX_TREE,
+ BSET_RW_AUX_TREE,
+};
+
+#define BSET_TREE_NR_TYPES 3
+
+#define BSET_NO_AUX_TREE_VAL (U16_MAX)
+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
+
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
+{
+ switch (t->extra) {
+ case BSET_NO_AUX_TREE_VAL:
+ EBUG_ON(t->size);
+ return BSET_NO_AUX_TREE;
+ case BSET_RW_AUX_TREE_VAL:
+ EBUG_ON(!t->size);
+ return BSET_RW_AUX_TREE;
+ default:
+ EBUG_ON(!t->size);
+ return BSET_RO_AUX_TREE;
+ }
+}
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but the lookup code touches slightly less memory with a
+ * larger value (it's now 256, as defined below).
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bkey_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE 256
+
+static inline size_t btree_keys_cachelines(const struct btree *b)
+{
+ return (1U << b->byte_order) / BSET_CACHELINE;
+}
+
+static inline size_t btree_aux_data_bytes(const struct btree *b)
+{
+ return btree_keys_cachelines(b) * 8;
+}
+
+static inline size_t btree_aux_data_u64s(const struct btree *b)
+{
+ return btree_aux_data_bytes(b) / sizeof(u64);
+}
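+
+/*
+ * Purely illustrative numbers (the btree node size is a filesystem option):
+ * with a 256k btree node, byte_order is 18, so btree_keys_cachelines() is
+ * (1 << 18) / 256 = 1024, btree_aux_data_bytes() is 1024 * 8 = 8192, and
+ * btree_aux_data_u64s() is 1024.
+ */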
+
+#define for_each_bset(_b, _t) \
+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+#define bset_tree_for_each_key(_b, _t, _k) \
+ for (_k = btree_bkey_first(_b, _t); \
+ _k != btree_bkey_last(_b, _t); \
+ _k = bkey_p_next(_k))
+
+static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
+{
+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
+}
+
+static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
+{
+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
+}
+
+static inline void bch2_bset_set_no_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ BUG_ON(t < b->set);
+
+ for (; t < b->set + ARRAY_SIZE(b->set); t++) {
+ t->size = 0;
+ t->extra = BSET_NO_AUX_TREE_VAL;
+ t->aux_data_offset = U16_MAX;
+ }
+}
+
+static inline void btree_node_set_format(struct btree *b,
+ struct bkey_format f)
+{
+ int len;
+
+ b->format = f;
+ b->nr_key_bits = bkey_format_key_bits(&f);
+
+ len = bch2_compile_bkey_format(&b->format, b->aux_data);
+ BUG_ON(len < 0 || len > U8_MAX);
+
+ b->unpack_fn_len = len;
+
+ bch2_bset_set_no_aux_tree(b, b->set);
+}
+
+static inline struct bset *bset_next_set(struct btree *b,
+ unsigned block_bytes)
+{
+ struct bset *i = btree_bset_last(b);
+
+ EBUG_ON(!is_power_of_2(block_bytes));
+
+ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
+}
+
+void bch2_btree_keys_init(struct btree *);
+
+void bch2_bset_init_first(struct btree *, struct bset *);
+void bch2_bset_init_next(struct bch_fs *, struct btree *,
+ struct btree_node_entry *);
+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
+
+void bch2_bset_insert(struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
+
+/* Bkey utility code */
+
+/* packed or unpacked */
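+/*
+ * Compare a packed key l against a search key that the caller may supply in
+ * two forms: r_packed (when the search key packs in this btree node's format)
+ * and r, the unpacked position, which is always provided. When r_packed is
+ * available the packed-format comparison is used; otherwise we fall back to
+ * comparing against the unpacked position.
+ */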
+static inline int bkey_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ const struct bpos *r)
+{
+ EBUG_ON(r_packed && !bkey_packed(r_packed));
+
+ if (unlikely(!bkey_packed(l)))
+ return bpos_cmp(packed_to_bkey_c(l)->p, *r);
+
+ if (likely(r_packed))
+ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
+
+ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+static inline struct bset_tree *
+bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
+{
+ unsigned offset = __btree_node_key_to_offset(b, k);
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ if (offset <= t->end_offset) {
+ EBUG_ON(offset < btree_bkey_first_offset(t));
+ return t;
+ }
+
+ BUG();
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
+
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
+ struct bkey_packed *, unsigned);
+
+static inline struct bkey_packed *
+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+ return bch2_bkey_prev_filter(b, t, k, 0);
+}
+
+static inline struct bkey_packed *
+bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+ return bch2_bkey_prev_filter(b, t, k, 1);
+}
+
+/* Btree key iteration */
+
+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
+ struct bpos *);
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
+ struct btree *);
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
+ struct btree *,
+ struct bset_tree *);
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
+ struct btree_node_iter_set *);
+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
+
+#define btree_node_iter_for_each(_iter, _set) \
+ for (_set = (_iter)->data; \
+ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
+ (_set)->k != (_set)->end; \
+ _set++)
+
+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
+ unsigned i)
+{
+ return iter->data[i].k == iter->data[i].end;
+}
+
+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
+{
+ return __btree_node_iter_set_end(iter, 0);
+}
+
+/*
+ * When keys compare equal, deleted keys compare first:
+ *
+ * XXX: only need to compare pointers for keys that are both within a
+ * btree_node_iterator - we need to break ties for prev() to work correctly
+ */
+static inline int bkey_iter_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed(b, l, r)
+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
+ ?: cmp_int(l, r);
+}
+
+static inline int btree_node_iter_cmp(const struct btree *b,
+ struct btree_node_iter_set l,
+ struct btree_node_iter_set r)
+{
+ return bkey_iter_cmp(b,
+ __btree_node_offset_to_key(b, l.k),
+ __btree_node_offset_to_key(b, r.k));
+}
+
+/* These assume r (the search key) is not a deleted key: */
+static inline int bkey_iter_pos_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
+{
+ return bkey_cmp_left_packed(b, l, r)
+ ?: -((int) bkey_deleted(l));
+}
+
+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ const struct bpos *r)
+{
+ return bkey_cmp_p_or_unp(b, l, r_packed, r)
+ ?: -((int) bkey_deleted(l));
+}
+
+static inline struct bkey_packed *
+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ return __btree_node_offset_to_key(b, iter->data->k);
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
+{
+ return !bch2_btree_node_iter_end(iter)
+ ? __btree_node_offset_to_key(b, iter->data->k)
+ : NULL;
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
+{
+ struct bkey_packed *k;
+
+ while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
+ bkey_deleted(k))
+ bch2_btree_node_iter_advance(iter, b);
+
+ return k;
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
+{
+ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
+
+ if (ret)
+ bch2_btree_node_iter_advance(iter, b);
+
+ return ret;
+}
+
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
+ struct btree *);
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
+ struct btree *);
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
+ struct btree *,
+ struct bkey *);
+
+#define for_each_btree_node_key(b, k, iter) \
+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \
+ (k = bch2_btree_node_iter_peek((iter), (b))); \
+ bch2_btree_node_iter_advance(iter, b))
+
+#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \
+ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
+ bch2_btree_node_iter_advance(iter, b))
+
+/* Accounting: */
+
+static inline void btree_keys_account_key(struct btree_nr_keys *n,
+ unsigned bset,
+ struct bkey_packed *k,
+ int sign)
+{
+ n->live_u64s += k->u64s * sign;
+ n->bset_u64s[bset] += k->u64s * sign;
+
+ if (bkey_packed(k))
+ n->packed_keys += sign;
+ else
+ n->unpacked_keys += sign;
+}
+
+static inline void btree_keys_account_val_delta(struct btree *b,
+ struct bkey_packed *k,
+ int delta)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
+
+ b->nr.live_u64s += delta;
+ b->nr.bset_u64s[t - b->set] += delta;
+}
+
+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
+ btree_keys_account_key(_nr, _bset_idx, _k, 1)
+#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
+ btree_keys_account_key(_nr, _bset_idx, _k, -1)
+
+#define btree_account_key_add(_b, _k) \
+ btree_keys_account_key(&(_b)->nr, \
+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
+#define btree_account_key_drop(_b, _k) \
+ btree_keys_account_key(&(_b)->nr, \
+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
+
+struct bset_stats {
+ struct {
+ size_t nr, bytes;
+ } sets[BSET_TREE_NR_TYPES];
+
+ size_t floats;
+ size_t failed;
+};
+
+void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
+void bch2_bfloat_to_text(struct printbuf *, struct btree *,
+ struct bkey_packed *);
+
+/* Debug stuff */
+
+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
+void bch2_dump_btree_node(struct bch_fs *, struct btree *);
+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *);
+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
+ struct bkey_packed *, unsigned);
+
+#else
+
+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b) {}
+static inline void bch2_verify_insert_pos(struct btree *b,
+ struct bkey_packed *where,
+ struct bkey_packed *insert,
+ unsigned clobber_u64s) {}
+#endif
+
+static inline void bch2_verify_btree_nr_keys(struct btree *b)
+{
+ if (bch2_debug_check_btree_accounting)
+ __bch2_verify_btree_nr_keys(b);
+}
+
+#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
new file mode 100644
index 000000000000..79495cd7a794
--- /dev/null
+++ b/fs/bcachefs/btree_cache.c
@@ -0,0 +1,1215 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "trace.h"
+
+#include <linux/prefetch.h>
+#include <linux/sched/mm.h>
+
+const char * const bch2_btree_node_flags[] = {
+#define x(f) #f,
+ BTREE_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_recalc_btree_reserve(struct bch_fs *c)
+{
+ unsigned i, reserve = 16;
+
+ if (!c->btree_roots_known[0].b)
+ reserve += 8;
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (r->b)
+ reserve += min_t(unsigned, 1, r->b->c.level) * 8;
+ }
+
+ c->btree_cache.reserve = reserve;
+}
+
+static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+{
+ return max_t(int, 0, bc->used - bc->reserve);
+}
+
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+ if (b->c.lock.readers)
+ list_move(&b->list, &bc->freed_pcpu);
+ else
+ list_move(&b->list, &bc->freed_nonpcpu);
+}
+
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ EBUG_ON(btree_node_write_in_flight(b));
+
+ clear_btree_node_just_written(b);
+
+ kvpfree(b->data, btree_bytes(c));
+ b->data = NULL;
+#ifdef __KERNEL__
+ kvfree(b->aux_data);
+#else
+ munmap(b->aux_data, btree_aux_data_bytes(b));
+#endif
+ b->aux_data = NULL;
+
+ bc->used--;
+
+ btree_node_to_freedlist(bc, b);
+}
+
+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct btree *b = obj;
+ const u64 *v = arg->key;
+
+ return b->hash_val == *v ? 0 : 1;
+}
+
+static const struct rhashtable_params bch_btree_cache_params = {
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
+};
+
+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+{
+ BUG_ON(b->data || b->aux_data);
+
+ b->data = kvpmalloc(btree_bytes(c), gfp);
+ if (!b->data)
+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+#ifdef __KERNEL__
+ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
+#else
+ b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
+ PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+ if (b->aux_data == MAP_FAILED)
+ b->aux_data = NULL;
+#endif
+ if (!b->aux_data) {
+ kvpfree(b->data, btree_bytes(c));
+ b->data = NULL;
+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ }
+
+ return 0;
+}
+
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
+{
+ struct btree *b;
+
+ b = kzalloc(sizeof(struct btree), gfp);
+ if (!b)
+ return NULL;
+
+ bkey_btree_ptr_init(&b->key);
+ INIT_LIST_HEAD(&b->list);
+ INIT_LIST_HEAD(&b->write_blocked);
+ b->byte_order = ilog2(btree_bytes(c));
+ return b;
+}
+
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ if (!b)
+ return NULL;
+
+ if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
+ kfree(b);
+ return NULL;
+ }
+
+ bch2_btree_lock_init(&b->c, 0);
+
+ bc->used++;
+ list_add(&b->list, &bc->freeable);
+ return b;
+}
+
+/* Btree in memory cache - hash table */
+
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+{
+ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+
+ BUG_ON(ret);
+
+ /* Cause future lookups for this node to fail: */
+ b->hash_val = 0;
+}
+
+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(b->hash_val);
+ b->hash_val = btree_ptr_hash_val(&b->key);
+
+ return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+ bch_btree_cache_params);
+}
+
+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
+ unsigned level, enum btree_id id)
+{
+ int ret;
+
+ b->c.level = level;
+ b->c.btree_id = id;
+
+ mutex_lock(&bc->lock);
+ ret = __bch2_btree_node_hash_insert(bc, b);
+ if (!ret)
+ list_add_tail(&b->list, &bc->live);
+ mutex_unlock(&bc->lock);
+
+ return ret;
+}
+
+__flatten
+static inline struct btree *btree_cache_find(struct btree_cache *bc,
+ const struct bkey_i *k)
+{
+ u64 v = btree_ptr_hash_val(k);
+
+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
+}
+
+/*
+ * this version is for btree nodes that have already been freed (we're not
+ * reaping a real btree node)
+ */
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ int ret = 0;
+
+ lockdep_assert_held(&bc->lock);
+wait_on_io:
+ if (b->flags & ((1U << BTREE_NODE_dirty)|
+ (1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush)
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
+ /* XXX: waiting on IO with btree cache lock held */
+ bch2_btree_node_wait_on_read(b);
+ bch2_btree_node_wait_on_write(b);
+ }
+
+ if (!six_trylock_intent(&b->c.lock))
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
+ if (!six_trylock_write(&b->c.lock))
+ goto out_unlock_intent;
+
+ /* recheck under lock */
+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush)
+ goto out_unlock;
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+
+ if (btree_node_noevict(b) ||
+ btree_node_write_blocked(b) ||
+ btree_node_will_make_reachable(b))
+ goto out_unlock;
+
+ if (btree_node_dirty(b)) {
+ if (!flush)
+ goto out_unlock;
+ /*
+ * Using the underscore version because we don't want to compact
+ * bsets after the write, since this node is about to be evicted
+ * - unless btree verify mode is enabled, since it runs out of
+ * the post write cleanup:
+ */
+ if (bch2_verify_btree_ondisk)
+ bch2_btree_node_write(c, b, SIX_LOCK_intent,
+ BTREE_WRITE_cache_reclaim);
+ else
+ __bch2_btree_node_write(c, b,
+ BTREE_WRITE_cache_reclaim);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+out:
+ if (b->hash_val && !ret)
+ trace_and_count(c, btree_cache_reap, c, b);
+ return ret;
+out_unlock:
+ six_unlock_write(&b->c.lock);
+out_unlock_intent:
+ six_unlock_intent(&b->c.lock);
+ ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
+ goto out;
+}
+
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+{
+ return __btree_node_reclaim(c, b, false);
+}
+
+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
+{
+ return __btree_node_reclaim(c, b, true);
+}
+
+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b, *t;
+ unsigned long nr = sc->nr_to_scan;
+ unsigned long can_free = 0;
+ unsigned long freed = 0;
+ unsigned long touched = 0;
+ unsigned i, flags;
+ unsigned long ret = SHRINK_STOP;
+ bool trigger_writes = atomic_read(&bc->dirty) + nr >=
+ bc->used * 3 / 4;
+
+ if (bch2_btree_shrinker_disabled)
+ return SHRINK_STOP;
+
+ mutex_lock(&bc->lock);
+ flags = memalloc_nofs_save();
+
+ /*
+ * It's _really_ critical that we don't free too many btree nodes - we
+ * have to always leave ourselves a reserve. The reserve is how we
+ * guarantee that allocating memory for a new btree node can always
+ * succeed, so that inserting keys into the btree can always succeed and
+ * IO can always make forward progress:
+ */
+ can_free = btree_cache_can_free(bc);
+ nr = min_t(unsigned long, nr, can_free);
+
+ i = 0;
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
+ /*
+ * Leave a few nodes on the freeable list, so that a btree split
+ * won't have to hit the system allocator:
+ */
+ if (++i <= 3)
+ continue;
+
+ touched++;
+
+ if (touched >= nr)
+ goto out;
+
+ if (!btree_node_reclaim(c, b)) {
+ btree_node_data_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ freed++;
+ }
+ }
+restart:
+ list_for_each_entry_safe(b, t, &bc->live, list) {
+ touched++;
+
+ if (btree_node_accessed(b)) {
+ clear_btree_node_accessed(b);
+ } else if (!btree_node_reclaim(c, b)) {
+ freed++;
+ btree_node_data_free(c, b);
+
+ bch2_btree_node_hash_remove(bc, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ if (freed == nr)
+ goto out_rotate;
+ } else if (trigger_writes &&
+ btree_node_dirty(b) &&
+ !btree_node_will_make_reachable(b) &&
+ !btree_node_write_blocked(b) &&
+ six_trylock_read(&b->c.lock)) {
+ list_move(&bc->live, &b->list);
+ mutex_unlock(&bc->lock);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+ six_unlock_read(&b->c.lock);
+ if (touched >= nr)
+ goto out_nounlock;
+ mutex_lock(&bc->lock);
+ goto restart;
+ }
+
+ if (touched >= nr)
+ break;
+ }
+out_rotate:
+ if (&t->list != &bc->live)
+ list_move_tail(&bc->live, &t->list);
+out:
+ mutex_unlock(&bc->lock);
+out_nounlock:
+ ret = freed;
+ memalloc_nofs_restore(flags);
+ trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
+ return ret;
+}
+
+static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_cache *bc = &c->btree_cache;
+
+ if (bch2_btree_shrinker_disabled)
+ return 0;
+
+ return btree_cache_can_free(bc);
+}
+
+void bch2_fs_btree_cache_exit(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ unsigned i, flags;
+
+ shrinker_free(bc->shrink);
+
+ /* vfree() can allocate memory: */
+ flags = memalloc_nofs_save();
+ mutex_lock(&bc->lock);
+
+ if (c->verify_data)
+ list_move(&c->verify_data->list, &bc->live);
+
+ kvpfree(c->verify_ondisk, btree_bytes(c));
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (r->b)
+ list_add(&r->b->list, &bc->live);
+ }
+
+ list_splice(&bc->freeable, &bc->live);
+
+ while (!list_empty(&bc->live)) {
+ b = list_first_entry(&bc->live, struct btree, list);
+
+ BUG_ON(btree_node_read_in_flight(b) ||
+ btree_node_write_in_flight(b));
+
+ btree_node_data_free(c, b);
+ }
+
+ BUG_ON(!bch2_journal_error(&c->journal) &&
+ atomic_read(&c->btree_cache.dirty));
+
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ while (!list_empty(&bc->freed_nonpcpu)) {
+ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
+ list_del(&b->list);
+ six_lock_exit(&b->c.lock);
+ kfree(b);
+ }
+
+ mutex_unlock(&bc->lock);
+ memalloc_nofs_restore(flags);
+
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
+}
+
+int bch2_fs_btree_cache_init(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct shrinker *shrink;
+ unsigned i;
+ int ret = 0;
+
+ ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
+ if (ret)
+ goto err;
+
+ bc->table_init_done = true;
+
+ bch2_recalc_btree_reserve(c);
+
+ for (i = 0; i < bc->reserve; i++)
+ if (!__bch2_btree_node_mem_alloc(c))
+ goto err;
+
+ list_splice_init(&bc->live, &bc->freeable);
+
+ mutex_init(&c->verify_lock);
+
+ shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
+ if (!shrink)
+ goto err;
+ bc->shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 4;
+ shrink->private_data = c;
+ shrinker_register(shrink);
+
+ return 0;
+err:
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+}
+
+void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
+{
+ mutex_init(&bc->lock);
+ INIT_LIST_HEAD(&bc->live);
+ INIT_LIST_HEAD(&bc->freeable);
+ INIT_LIST_HEAD(&bc->freed_pcpu);
+ INIT_LIST_HEAD(&bc->freed_nonpcpu);
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open coded mutex to ensure that, which
+ * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
+ * the root of the btree, we need to release this lock if we have it held.
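+ *
+ * A sketch of the expected calling pattern (illustrative only, not a verbatim
+ * caller; assumes a struct bch_fs *c, btree_trans *trans and closure *cl in
+ * scope):
+ *
+ *	if (!bch2_btree_cache_cannibalize_lock(c, cl)) {
+ *		b = bch2_btree_node_mem_alloc(trans, false);
+ *		...
+ *		bch2_btree_cache_cannibalize_unlock(c);
+ *	}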
+ */
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ if (bc->alloc_lock == current) {
+ trace_and_count(c, btree_cache_cannibalize_unlock, c);
+ bc->alloc_lock = NULL;
+ closure_wake_up(&bc->alloc_wait);
+ }
+}
+
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct task_struct *old;
+
+ old = cmpxchg(&bc->alloc_lock, NULL, current);
+ if (old == NULL || old == current)
+ goto success;
+
+ if (!cl) {
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
+ }
+
+ closure_wait(&bc->alloc_wait, cl);
+
+ /* Try again, after adding ourselves to waitlist */
+ old = cmpxchg(&bc->alloc_lock, NULL, current);
+ if (old == NULL || old == current) {
+ /* We raced */
+ closure_wake_up(&bc->alloc_wait);
+ goto success;
+ }
+
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
+
+success:
+ trace_and_count(c, btree_cache_cannibalize_lock, c);
+ return 0;
+}
+
+static struct btree *btree_node_cannibalize(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ list_for_each_entry_reverse(b, &bc->live, list)
+ if (!btree_node_reclaim(c, b))
+ return b;
+
+ while (1) {
+ list_for_each_entry_reverse(b, &bc->live, list)
+ if (!btree_node_write_and_reclaim(c, b))
+ return b;
+
+ /*
+ * Rare case: all nodes were intent-locked.
+ * Just busy-wait.
+ */
+ WARN_ONCE(1, "btree cache cannibalize failed\n");
+ cond_resched();
+ }
+}
+
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct list_head *freed = pcpu_read_locks
+ ? &bc->freed_pcpu
+ : &bc->freed_nonpcpu;
+ struct btree *b, *b2;
+ u64 start_time = local_clock();
+ unsigned flags;
+
+ flags = memalloc_nofs_save();
+ mutex_lock(&bc->lock);
+
+ /*
+ * We never free struct btree itself, just the memory that holds the on
+ * disk node. Check the freed list before allocating a new one:
+ */
+ list_for_each_entry(b, freed, list)
+ if (!btree_node_reclaim(c, b)) {
+ list_del_init(&b->list);
+ goto got_node;
+ }
+
+ b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
+ if (!b) {
+ mutex_unlock(&bc->lock);
+ bch2_trans_unlock(trans);
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ if (!b)
+ goto err;
+ mutex_lock(&bc->lock);
+ }
+
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
+
+ BUG_ON(!six_trylock_intent(&b->c.lock));
+ BUG_ON(!six_trylock_write(&b->c.lock));
+got_node:
+
+ /*
+ * btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there's any freed nodes there:
+ */
+ list_for_each_entry(b2, &bc->freeable, list)
+ if (!btree_node_reclaim(c, b2)) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ goto got_mem;
+ }
+
+ mutex_unlock(&bc->lock);
+
+ if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
+ bch2_trans_unlock(trans);
+ if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
+ goto err;
+ }
+
+ mutex_lock(&bc->lock);
+ bc->used++;
+got_mem:
+ mutex_unlock(&bc->lock);
+
+ BUG_ON(btree_node_hashed(b));
+ BUG_ON(btree_node_dirty(b));
+ BUG_ON(btree_node_write_in_flight(b));
+out:
+ b->flags = 0;
+ b->written = 0;
+ b->nsets = 0;
+ b->sib_u64s[0] = 0;
+ b->sib_u64s[1] = 0;
+ b->whiteout_u64s = 0;
+ bch2_btree_keys_init(b);
+ set_btree_node_accessed(b);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+ start_time);
+
+ memalloc_nofs_restore(flags);
+ return b;
+err:
+ mutex_lock(&bc->lock);
+
+ /* Try to cannibalize another cached btree node: */
+ if (bc->alloc_lock == current) {
+ b2 = btree_node_cannibalize(c);
+ clear_btree_node_just_written(b2);
+ bch2_btree_node_hash_remove(bc, b2);
+
+ if (b) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ } else {
+ b = b2;
+ list_del_init(&b->list);
+ }
+
+ mutex_unlock(&bc->lock);
+
+ trace_and_count(c, btree_cache_cannibalize, c);
+ goto out;
+ }
+
+ mutex_unlock(&bc->lock);
+ memalloc_nofs_restore(flags);
+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
+}
+
+/* Slowpath, don't want it inlined into btree_iter_traverse() */
+static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
+ struct btree_path *path,
+ const struct bkey_i *k,
+ enum btree_id btree_id,
+ unsigned level,
+ enum six_lock_type lock_type,
+ bool sync)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ u32 seq;
+
+ BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+ /*
+ * Parent node must be locked, else we could read in a btree node that's
+ * been freed:
+ */
+ if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
+ trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
+ }
+
+ b = bch2_btree_node_mem_alloc(trans, level != 0);
+
+ if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+ trans->memory_allocation_failure = true;
+ trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
+ }
+
+ if (IS_ERR(b))
+ return b;
+
+ /*
+ * Btree nodes read in from disk should not have the accessed bit set
+ * initially, so that linear scans don't thrash the cache:
+ */
+ clear_btree_node_accessed(b);
+
+ bkey_copy(&b->key, k);
+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
+ /* raced with another fill: */
+
+ /* mark as unhashed... */
+ b->hash_val = 0;
+
+ mutex_lock(&bc->lock);
+ list_add(&b->list, &bc->freeable);
+ mutex_unlock(&bc->lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ return NULL;
+ }
+
+ set_btree_node_read_in_flight(b);
+
+ six_unlock_write(&b->c.lock);
+ seq = six_lock_seq(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ /* Unlock before doing IO: */
+ if (path && sync)
+ bch2_trans_unlock_noassert(trans);
+
+ bch2_btree_node_read(c, b, sync);
+
+ if (!sync)
+ return NULL;
+
+ if (path) {
+ int ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(ret);
+ }
+ }
+
+ if (!six_relock_type(&b->c.lock, lock_type, seq)) {
+ if (path)
+ trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+ }
+
+ return b;
+}
+
+static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
+{
+ struct printbuf buf = PRINTBUF;
+
+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ return;
+
+ prt_printf(&buf,
+ "btree node header doesn't match ptr\n"
+ "btree %s level %u\n"
+ "ptr: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_printf(&buf, "\nheader: btree %s level %llu\n"
+ "min ",
+ bch2_btree_id_str(BTREE_NODE_ID(b->data)),
+ BTREE_NODE_LEVEL(b->data));
+ bch2_bpos_to_text(&buf, b->data->min_key);
+
+ prt_printf(&buf, "\nmax ");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+}
+
+static inline void btree_check_header(struct bch_fs *c, struct btree *b)
+{
+ if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
+ b->c.level != BTREE_NODE_LEVEL(b->data) ||
+ !bpos_eq(b->data->max_key, b->key.k.p) ||
+ (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(b->data->min_key,
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
+ btree_bad_header(c, b);
+}
+
+static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ struct bset_tree *t;
+ bool need_relock = false;
+ int ret;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+retry:
+ b = btree_cache_find(bc, k);
+ if (unlikely(!b)) {
+ /*
+ * We must have the parent locked to call bch2_btree_node_fill(),
+ * else we could read in a btree node from disk that's been
+ * freed:
+ */
+ b = bch2_btree_node_fill(trans, path, k, path->btree_id,
+ level, lock_type, true);
+ need_relock = true;
+
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
+
+ if (IS_ERR(b))
+ return b;
+ } else {
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.level != level ||
+ race_fault())) {
+ six_unlock_type(&b->c.lock, lock_type);
+ if (bch2_btree_node_relock(trans, path, level + 1))
+ goto retry;
+
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+ }
+
+ if (unlikely(btree_node_read_in_flight(b))) {
+ u32 seq = six_lock_seq(&b->c.lock);
+
+ six_unlock_type(&b->c.lock, lock_type);
+ bch2_trans_unlock(trans);
+ need_relock = true;
+
+ bch2_btree_node_wait_on_read(b);
+
+ /*
+ * should_be_locked is not set on this path yet, so we need to
+ * relock it specifically:
+ */
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ goto retry;
+ }
+
+ if (unlikely(need_relock)) {
+ ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(ret);
+ }
+ }
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(b->c.btree_id != path->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
+
+ return b;
+}
+
+/**
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * @trans: btree transaction object
+ * @path: btree_path being traversed
+ * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level: level of btree node being looked up (0 == leaf node)
+ * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
+ * The btree node will have either a read or a write lock held, depending on
+ * the @write parameter.
+ *
+ * Returns: btree node or ERR_PTR()
+ */
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bset_tree *t;
+ int ret;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ b = btree_node_mem_ptr(k);
+
+ /*
+ * Check b->hash_val _before_ calling btree_node_lock() - this might not
+ * be the node we want anymore, and trying to lock the wrong node could
+ * cause an unnecessary transaction restart:
+ */
+ if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
+ !b ||
+ b->hash_val != btree_ptr_hash_val(k)))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.level != level ||
+ race_fault())) {
+ six_unlock_type(&b->c.lock, lock_type);
+ if (bch2_btree_node_relock(trans, path, level + 1))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
+ }
+
+ if (unlikely(btree_node_read_in_flight(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+ }
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(b->c.btree_id != path->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
+
+ return b;
+}
+
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
+ const struct bkey_i *k,
+ enum btree_id btree_id,
+ unsigned level,
+ bool nofill)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ struct bset_tree *t;
+ int ret;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ if (c->opts.btree_node_mem_ptr_optimization) {
+ b = btree_node_mem_ptr(k);
+ if (b)
+ goto lock_node;
+ }
+retry:
+ b = btree_cache_find(bc, k);
+ if (unlikely(!b)) {
+ if (nofill)
+ goto out;
+
+ b = bch2_btree_node_fill(trans, NULL, k, btree_id,
+ level, SIX_LOCK_read, true);
+
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
+
+ if (IS_ERR(b) &&
+ !bch2_btree_cache_cannibalize_lock(c, NULL))
+ goto retry;
+
+ if (IS_ERR(b))
+ goto out;
+ } else {
+lock_node:
+ ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.btree_id != btree_id ||
+ b->c.level != level)) {
+ six_unlock_read(&b->c.lock);
+ goto retry;
+ }
+ }
+
+ /* XXX: waiting on IO with btree locks held: */
+ __bch2_btree_node_wait_on_read(b);
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_read(&b->c.lock);
+ b = ERR_PTR(-EIO);
+ goto out;
+ }
+
+ EBUG_ON(b->c.btree_id != btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
+out:
+ bch2_btree_cache_cannibalize_unlock(c);
+ return b;
+}
+
+int bch2_btree_node_prefetch(struct btree_trans *trans,
+ struct btree_path *path,
+ const struct bkey_i *k,
+ enum btree_id btree_id, unsigned level)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ BUG_ON(trans && !btree_node_locked(path, level + 1));
+ BUG_ON(level >= BTREE_MAX_DEPTH);
+
+ b = btree_cache_find(bc, k);
+ if (b)
+ return 0;
+
+ b = bch2_btree_node_fill(trans, path, k, btree_id,
+ level, SIX_LOCK_read, false);
+ return PTR_ERR_OR_ZERO(b);
+}
+
+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ b = btree_cache_find(bc, k);
+ if (!b)
+ return;
+wait_on_io:
+ /* not allowed to wait on io with btree locks held: */
+
+ /* XXX we're called from btree_gc which will be holding other btree
+ * nodes locked
+ */
+ __bch2_btree_node_wait_on_read(b);
+ __bch2_btree_node_wait_on_write(b);
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+
+ if (btree_node_dirty(b)) {
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+
+ BUG_ON(btree_node_dirty(b));
+
+ mutex_lock(&bc->lock);
+ btree_node_data_free(c, b);
+ bch2_btree_node_hash_remove(bc, b);
+ mutex_unlock(&bc->lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+}
+
+const char *bch2_btree_id_str(enum btree_id btree)
+{
+ return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
+}
+
+void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+ prt_printf(out, "%s level %u/%u\n ",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level,
+ bch2_btree_id_root(c, b->c.btree_id)->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+ struct bset_stats stats;
+
+ memset(&stats, 0, sizeof(stats));
+
+ bch2_btree_keys_stats(b, &stats);
+
+ prt_printf(out, "l %u ", b->c.level);
+ bch2_bpos_to_text(out, b->data->min_key);
+ prt_printf(out, " - ");
+ bch2_bpos_to_text(out, b->data->max_key);
+ prt_printf(out, ":\n"
+ " ptrs: ");
+ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out,
+ " format: ");
+ bch2_bkey_format_to_text(out, &b->format);
+
+ prt_printf(out,
+ " unpack fn len: %u\n"
+ " bytes used %zu/%zu (%zu%% full)\n"
+ " sib u64s: %u, %u (merge threshold %u)\n"
+ " nr packed keys %u\n"
+ " nr unpacked keys %u\n"
+ " floats %zu\n"
+ " failed unpacked %zu\n",
+ b->unpack_fn_len,
+ b->nr.live_u64s * sizeof(u64),
+ btree_bytes(c) - sizeof(struct btree_node),
+ b->nr.live_u64s * 100 / btree_max_u64s(c),
+ b->sib_u64s[0],
+ b->sib_u64s[1],
+ c->btree_foreground_merge_threshold,
+ b->nr.packed_keys,
+ b->nr.unpacked_keys,
+ stats.floats,
+ stats.failed);
+}
+
+void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
+{
+ prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
new file mode 100644
index 000000000000..cfb80b201d61
--- /dev/null
+++ b/fs/bcachefs/btree_cache.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_CACHE_H
+#define _BCACHEFS_BTREE_CACHE_H
+
+#include "bcachefs.h"
+#include "btree_types.h"
+#include "bkey_methods.h"
+
+extern const char * const bch2_btree_node_flags[];
+
+struct btree_iter;
+
+void bch2_recalc_btree_reserve(struct bch_fs *);
+
+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
+ unsigned, enum btree_id);
+
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
+
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
+ const struct bkey_i *, unsigned,
+ enum six_lock_type, unsigned long);
+
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
+ enum btree_id, unsigned, bool);
+
+int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
+ const struct bkey_i *, enum btree_id, unsigned);
+
+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
+
+void bch2_fs_btree_cache_exit(struct bch_fs *);
+int bch2_fs_btree_cache_init(struct bch_fs *);
+void bch2_fs_btree_cache_init_early(struct btree_cache *);
+
+static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
+ case KEY_TYPE_btree_ptr_v2:
+ /*
+ * The cast/deref is only necessary to avoid sparse endianness
+ * warnings:
+ */
+ return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
+ default:
+ return 0;
+ }
+}
+
+static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
+{
+ return k->k.type == KEY_TYPE_btree_ptr_v2
+ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
+ : NULL;
+}
+
+/* is btree node in hash table? */
+static inline bool btree_node_hashed(struct btree *b)
+{
+ return b->hash_val != 0;
+}
+
+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
+ &(_c)->btree_cache.table), \
+ _iter = 0; _iter < (_tbl)->size; _iter++) \
+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
+
+static inline size_t btree_bytes(struct bch_fs *c)
+{
+ return c->opts.btree_node_size;
+}
+
+static inline size_t btree_max_u64s(struct bch_fs *c)
+{
+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+}
+
+static inline size_t btree_pages(struct bch_fs *c)
+{
+ return btree_bytes(c) / PAGE_SIZE;
+}
+
+static inline unsigned btree_blocks(struct bch_fs *c)
+{
+ return btree_sectors(c) >> c->block_bits;
+}
+
+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
+
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
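+
+/*
+ * For a purely illustrative btree_max_u64s(c) of 30000, the split threshold is
+ * 20000 u64s, the foreground merge threshold is 10000, and the hysteresis is
+ * 10000 + (10000 >> 2) = 12500.
+ */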
+
+static inline unsigned btree_id_nr_alive(struct bch_fs *c)
+{
+ return BTREE_ID_NR + c->btree_roots_extra.nr;
+}
+
+static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
+{
+ if (likely(id < BTREE_ID_NR)) {
+ return &c->btree_roots_known[id];
+ } else {
+ unsigned idx = id - BTREE_ID_NR;
+
+ EBUG_ON(idx >= c->btree_roots_extra.nr);
+ return &c->btree_roots_extra.data[idx];
+ }
+}
+
+static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
+{
+ return bch2_btree_id_root(c, b->c.btree_id)->b;
+}
+
+const char *bch2_btree_id_str(enum btree_id);
+void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
new file mode 100644
index 000000000000..30ab78a24517
--- /dev/null
+++ b/fs/bcachefs/btree_gc.c
@@ -0,0 +1,2146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright (C) 2014 Datera Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "bkey_buf.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "recovery.h"
+#include "reflink.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+
+#define DROP_THIS_NODE 10
+#define DROP_PREV_NODE 11
+
+static bool should_restart_for_topology_repair(struct bch_fs *c)
+{
+ return c->opts.fix_errors != FSCK_FIX_no &&
+ !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
+}
+
+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+ preempt_disable();
+ write_seqcount_begin(&c->gc_pos_lock);
+ c->gc_pos = new_pos;
+ write_seqcount_end(&c->gc_pos_lock);
+ preempt_enable();
+}
+
+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
+ __gc_pos_set(c, new_pos);
+}
+
+/*
+ * Missing: if an interior btree node is empty, we need to do something -
+ * perhaps just kill it
+ */
+static int bch2_gc_check_topology(struct bch_fs *c,
+ struct btree *b,
+ struct bkey_buf *prev,
+ struct bkey_buf cur,
+ bool is_last)
+{
+ struct bpos node_start = b->data->min_key;
+ struct bpos node_end = b->data->max_key;
+ struct bpos expected_start = bkey_deleted(&prev->k->k)
+ ? node_start
+ : bpos_successor(prev->k->k.p);
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ int ret = 0;
+
+ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
+
+ if (!bpos_eq(expected_start, bp->v.min_key)) {
+ bch2_topology_error(c);
+
+ if (bkey_deleted(&prev->k->k)) {
+ prt_printf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, node_start);
+ } else {
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
+ }
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
+
+ if (__fsck_err(c,
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " cur %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ goto err;
+ } else {
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ }
+ }
+ }
+
+ if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
+ bch2_bpos_to_text(&buf2, node_end);
+
+ if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
+ btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " %s\n"
+ " expected %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf) &&
+ should_restart_for_topology_repair(c)) {
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ goto err;
+ } else {
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ }
+ }
+
+ bch2_bkey_buf_copy(prev, c, cur.k);
+err:
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
+{
+ switch (b->key.k.type) {
+ case KEY_TYPE_btree_ptr: {
+ struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
+
+ dst->k.p = src->k.p;
+ dst->v.mem_ptr = 0;
+ dst->v.seq = b->data->keys.seq;
+ dst->v.sectors_written = 0;
+ dst->v.flags = 0;
+ dst->v.min_key = b->data->min_key;
+ set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
+ memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
+ break;
+ }
+ case KEY_TYPE_btree_ptr_v2:
+ bkey_copy(&dst->k_i, &b->key);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch2_btree_node_update_key_early(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bkey_buf tmp;
+ int ret;
+
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_reassemble(&tmp, c, old);
+
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
+ if (!IS_ERR_OR_NULL(b)) {
+ mutex_lock(&c->btree_cache.lock);
+
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&b->c.lock);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
+{
+ struct bkey_i_btree_ptr_v2 *new;
+ int ret;
+
+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
+ if (!new)
+ return -BCH_ERR_ENOMEM_gc_repair_key;
+
+ btree_ptr_to_v2(b, new);
+ b->data->min_key = new_min;
+ new->v.min_key = new_min;
+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
+
+ bch2_btree_node_drop_keys_outside_node(b);
+ bkey_copy(&b->key, &new->k_i);
+ return 0;
+}
+
+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
+{
+ struct bkey_i_btree_ptr_v2 *new;
+ int ret;
+
+ ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
+ if (ret)
+ return ret;
+
+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
+ if (!new)
+ return -BCH_ERR_ENOMEM_gc_repair_key;
+
+ btree_ptr_to_v2(b, new);
+ b->data->max_key = new_max;
+ new->k.p = new_max;
+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
+
+ bch2_btree_node_drop_keys_outside_node(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, &new->k_i);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ return 0;
+}
+
+static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
+ struct btree *prev, struct btree *cur)
+{
+ struct bpos expected_start = !prev
+ ? b->data->min_key
+ : bpos_successor(prev->key.k.p);
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ int ret = 0;
+
+ if (!prev) {
+ prt_printf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, b->data->min_key);
+ } else {
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+ }
+
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
+
+ if (prev &&
+ bpos_gt(expected_start, cur->data->min_key) &&
+ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
+ /* cur overwrites prev: */
+
+ if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
+ cur->data->min_key), c,
+ btree_node_topology_overwritten_by_next_node,
+ "btree node overwritten by next node at btree %s level %u:\n"
+ " node %s\n"
+ " next %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf)) {
+ ret = DROP_PREV_NODE;
+ goto out;
+ }
+
+ if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
+ bpos_predecessor(cur->data->min_key)), c,
+ btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " node %s\n"
+ " next %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ } else {
+ /* prev overwrites cur: */
+
+ if (mustfix_fsck_err_on(bpos_ge(expected_start,
+ cur->data->max_key), c,
+ btree_node_topology_overwritten_by_prev_node,
+ "btree node overwritten by prev node at btree %s level %u:\n"
+ " prev %s\n"
+ " node %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf)) {
+ ret = DROP_THIS_NODE;
+ goto out;
+ }
+
+ if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
+ btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " node %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+ struct btree *child)
+{
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ int ret = 0;
+
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
+ bch2_bpos_to_text(&buf2, b->key.k.p);
+
+ if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
+ btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " %s\n"
+ " expected %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf)) {
+ ret = set_node_max(c, child, b->key.k.p);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ struct bkey_buf prev_k, cur_k;
+ struct btree *prev = NULL, *cur = NULL;
+ bool have_child, dropped_children = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (!b->c.level)
+ return 0;
+again:
+ prev = NULL;
+ have_child = dropped_children = false;
+ bch2_bkey_buf_init(&prev_k);
+ bch2_bkey_buf_init(&cur_k);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+
+ bch2_btree_and_journal_iter_advance(&iter);
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
+
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(cur);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
+
+ if (mustfix_fsck_err_on(ret == -EIO, c,
+ btree_node_unreadable,
+ "Topology repair: unreadable btree node at btree %s level %u:\n"
+ " %s",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level - 1,
+ buf.buf)) {
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ if (ret) {
+ bch_err_msg(c, ret, "getting btree node");
+ break;
+ }
+
+ ret = btree_repair_node_boundaries(c, b, prev, cur);
+
+ if (ret == DROP_THIS_NODE) {
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ if (prev)
+ six_unlock_read(&prev->c.lock);
+ prev = NULL;
+
+ if (ret == DROP_PREV_NODE) {
+ bch2_btree_node_evict(trans, prev_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, prev_k.k->k.p);
+ if (ret)
+ break;
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
+ goto again;
+ } else if (ret)
+ break;
+
+ prev = cur;
+ cur = NULL;
+ bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
+ }
+
+ if (!ret && !IS_ERR_OR_NULL(prev)) {
+ BUG_ON(cur);
+ ret = btree_repair_node_end(c, b, prev);
+ }
+
+ if (!IS_ERR_OR_NULL(prev))
+ six_unlock_read(&prev->c.lock);
+ prev = NULL;
+ if (!IS_ERR_OR_NULL(cur))
+ six_unlock_read(&cur->c.lock);
+ cur = NULL;
+
+ if (ret)
+ goto err;
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(cur);
+
+ if (ret) {
+ bch_err_msg(c, ret, "getting btree node");
+ goto err;
+ }
+
+ ret = bch2_btree_repair_topology_recurse(trans, cur);
+ six_unlock_read(&cur->c.lock);
+ cur = NULL;
+
+ if (ret == DROP_THIS_NODE) {
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ dropped_children = true;
+ }
+
+ if (ret)
+ goto err;
+
+ have_child = true;
+ }
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ if (mustfix_fsck_err_on(!have_child, c,
+ btree_node_topology_interior_node_empty,
+ "empty interior btree node at btree %s level %u\n"
+ " %s",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level, buf.buf))
+ ret = DROP_THIS_NODE;
+err:
+fsck_err:
+ if (!IS_ERR_OR_NULL(prev))
+ six_unlock_read(&prev->c.lock);
+ if (!IS_ERR_OR_NULL(cur))
+ six_unlock_read(&cur->c.lock);
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
+
+ if (!ret && dropped_children)
+ goto again;
+
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_topology(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree *b;
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (!r->alive)
+ continue;
+
+ b = r->b;
+ if (btree_node_fake(b))
+ continue;
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(trans, b);
+ six_unlock_read(&b->c.lock);
+
+ if (ret == DROP_THIS_NODE) {
+ bch_err(c, "empty btree root - repair unimplemented");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ }
+ }
+
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c *k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ /*
+ * XXX
+ * use check_bucket_ref here
+ */
+ bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
+
+ if (!g->gen_valid &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ do_update = true;
+
+ if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ do_update = true;
+
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ continue;
+
+ if (fsck_err_on(bucket_data_type(g->data_type) &&
+ bucket_data_type(g->data_type) != data_type, c,
+ ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[g->data_type],
+ bch2_data_types[data_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->data_type = data_type;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive, c,
+ ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
+ do_update = true;
+
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
+ ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
+ do_update = true;
+ }
+ }
+
+ if (do_update) {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *new;
+
+ if (is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+ if (!new) {
+			ret = -BCH_ERR_ENOMEM_gc_repair_key;
+			bch_err_msg(c, ret, "allocating new key");
+ goto err;
+ }
+
+ bkey_reassemble(new, *k);
+
+ if (level) {
+ /*
+ * We don't want to drop btree node pointers - if the
+ * btree node isn't there anymore, the read path will
+ * sort it out:
+ */
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+
+ ptr->gen = g->gen;
+ }
+ } else {
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
+
+ (ptr->cached &&
+ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
+ (!ptr->cached &&
+ gen_cmp(ptr->gen, g->gen) < 0) ||
+ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type);
+ }));
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
+found:
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+ }
+
+ ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+ if (ret) {
+ kfree(new);
+ goto err;
+ }
+
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+
+ if (0) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, *k);
+ bch_info(c, "updated %s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+ }
+
+ *k = bkey_i_to_s_c(new);
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* marking of btree keys/nodes: */
+
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c *k,
+ bool initial)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey deleted = KEY(0, 0, 0);
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+ unsigned flags =
+ BTREE_TRIGGER_GC|
+ (initial ? BTREE_TRIGGER_NOATOMIC : 0);
+ int ret = 0;
+
+ deleted.p = k->k->p;
+
+ if (initial) {
+ BUG_ON(bch2_journal_seq_verify &&
+ k->k->version.lo > atomic64_read(&c->journal.seq));
+
+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
+ bkey_version_in_future,
+ "key version number higher than recorded: %llu > %llu",
+ k->k->version.lo,
+ atomic64_read(&c->key_version)))
+ atomic64_set(&c->key_version, k->k->version.lo);
+ }
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_mark_key(trans, btree_id, level, old, *k, flags));
+fsck_err:
+err:
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_node_iter iter;
+ struct bkey unpacked;
+ struct bkey_s_c k;
+ struct bkey_buf prev, cur;
+ int ret = 0;
+
+ if (!btree_node_type_needs_gc(btree_node_type(b)))
+ return 0;
+
+ bch2_btree_node_iter_init_from_start(&iter, b);
+ bch2_bkey_buf_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bkey_init(&prev.k->k);
+
+ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
+ &k, initial);
+ if (ret)
+ break;
+
+ bch2_btree_node_iter_advance(&iter, b);
+
+ if (b->c.level) {
+ bch2_bkey_buf_reassemble(&cur, c, k);
+
+ ret = bch2_gc_check_topology(c, b, &prev, cur,
+ bch2_btree_node_iter_end(&iter));
+ if (ret)
+ break;
+ }
+ }
+
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
+ return ret;
+}
+
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
+ bool initial, bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct btree *b;
+ unsigned depth = metadata_only ? 1 : 0;
+ int ret = 0;
+
+ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
+
+ __for_each_btree_node(trans, iter, btree_id, POS_MIN,
+ 0, depth, BTREE_ITER_PREFETCH, b, ret) {
+ bch2_verify_btree_nr_keys(b);
+
+ gc_pos_set(c, gc_pos_btree_node(b));
+
+ ret = btree_gc_mark_node(trans, b, initial);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->btree_root_lock);
+ b = bch2_btree_id_root(c, btree_id)->b;
+ if (!btree_node_fake(b)) {
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
+ true, &k, initial);
+ }
+ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
+ mutex_unlock(&c->btree_root_lock);
+
+ return ret;
+}
+
+static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
+ unsigned target_depth)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ struct bkey_buf cur, prev;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_bkey_buf_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bkey_init(&prev.k->k);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+ false, &k, true);
+ if (ret)
+ goto fsck_err;
+
+ if (b->c.level) {
+ bch2_bkey_buf_reassemble(&cur, c, k);
+ k = bkey_i_to_s_c(cur.k);
+
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ ret = bch2_gc_check_topology(c, b,
+ &prev, cur,
+ !bch2_btree_and_journal_iter_peek(&iter).k);
+ if (ret)
+ goto fsck_err;
+ } else {
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+ }
+
+ if (b->c.level > target_depth) {
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ struct btree *child;
+
+ bch2_bkey_buf_reassemble(&cur, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ child = bch2_btree_node_get_noiter(trans, cur.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(child);
+
+ if (ret == -EIO) {
+ bch2_topology_error(c);
+
+ if (__fsck_err(c,
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ btree_node_read_error,
+ "Unreadable btree node at btree %s level %u:\n"
+ " %s",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level - 1,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
+ should_restart_for_topology_repair(c)) {
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ goto fsck_err;
+ } else {
+				/*
+				 * Continue marking when we've opted not to
+				 * fix the error:
+				 */
+ ret = 0;
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ continue;
+ }
+ } else if (ret) {
+ bch_err_msg(c, ret, "getting btree node");
+ break;
+ }
+
+ ret = bch2_gc_btree_init_recurse(trans, child,
+ target_depth);
+ six_unlock_read(&child->c.lock);
+
+ if (ret)
+ break;
+ }
+ }
+fsck_err:
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
+ bch2_btree_and_journal_iter_exit(&iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_btree_init(struct btree_trans *trans,
+ enum btree_id btree_id,
+ bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ unsigned target_depth = metadata_only ? 1 : 0;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ b = bch2_btree_id_root(c, btree_id)->b;
+
+ if (btree_node_fake(b))
+ return 0;
+
+ six_lock_read(&b->c.lock, NULL, NULL);
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
+ btree_root_bad_min_key,
+ "btree root with incorrect min_key: %s", buf.buf)) {
+ bch_err(c, "repair unimplemented");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto fsck_err;
+ }
+
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
+ if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
+ btree_root_bad_max_key,
+ "btree root with incorrect max_key: %s", buf.buf)) {
+ bch_err(c, "repair unimplemented");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto fsck_err;
+ }
+
+ if (b->c.level >= target_depth)
+ ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
+
+ if (!ret) {
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
+ &k, true);
+ }
+fsck_err:
+ six_unlock_read(&b->c.lock);
+
+ if (ret < 0)
+ bch_err_fn(c, ret);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+{
+ return (int) btree_id_to_gc_phase(l) -
+ (int) btree_id_to_gc_phase(r);
+}
+
+static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ enum btree_id ids[BTREE_ID_NR];
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ ids[i] = i;
+ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
+
+ for (i = 0; i < BTREE_ID_NR && !ret; i++)
+ ret = initial
+ ? bch2_gc_btree_init(trans, ids[i], metadata_only)
+ : bch2_gc_btree(trans, ids[i], initial, metadata_only);
+
+ for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
+ if (!bch2_btree_id_root(c, i)->alive)
+ continue;
+
+ ret = initial
+ ? bch2_gc_btree_init(trans, i, metadata_only)
+ : bch2_gc_btree(trans, i, initial, metadata_only);
+ }
+
+ if (ret < 0)
+ bch_err_fn(c, ret);
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
+ u64 start, u64 end,
+ enum bch_data_type type,
+ unsigned flags)
+{
+ u64 b = sector_to_bucket(ca, start);
+
+ do {
+ unsigned sectors =
+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+ bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+ gc_phase(GC_PHASE_SB), flags);
+ b++;
+ start += sectors;
+ } while (start < end);
+}
+
+static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+ unsigned flags)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ unsigned i;
+ u64 b;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR)
+ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
+ BCH_DATA_sb, flags);
+
+ mark_metadata_sectors(c, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
+ BCH_DATA_sb, flags);
+ }
+
+ for (i = 0; i < ca->journal.nr; i++) {
+ b = ca->journal.buckets[i];
+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB), flags);
+ }
+}
+
+static void bch2_mark_superblocks(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ mutex_lock(&c->sb_lock);
+ gc_pos_set(c, gc_phase(GC_PHASE_SB));
+
+ for_each_online_member(ca, c, i)
+ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
+ mutex_unlock(&c->sb_lock);
+}
+
+#if 0
+/* Also see bch2_pending_btree_node_free_insert_done() */
+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
+{
+ struct btree_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
+
+ for_each_pending_btree_node_free(c, as, d)
+ if (d->index_update_done)
+ bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+#endif
+
+static void bch2_gc_free(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ genradix_free(&c->reflink_gc_table);
+ genradix_free(&c->gc_stripes);
+
+ for_each_member_device(ca, c, i) {
+ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
+ sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket));
+ ca->buckets_gc = NULL;
+
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
+ }
+
+ free_percpu(c->usage_gc);
+ c->usage_gc = NULL;
+}
+
+static int bch2_gc_done(struct bch_fs *c,
+ bool initial, bool metadata_only)
+{
+ struct bch_dev *ca = NULL;
+ struct printbuf buf = PRINTBUF;
+ bool verify = !metadata_only &&
+ !c->opts.reconstruct_alloc &&
+ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+ unsigned i, dev;
+ int ret = 0;
+
+ percpu_down_write(&c->mark_lock);
+
+#define copy_field(_err, _f, _msg, ...) \
+ if (dst->_f != src->_f && \
+ (!verify || \
+ fsck_err(c, _err, _msg ": got %llu, should be %llu" \
+ , ##__VA_ARGS__, dst->_f, src->_f))) \
+ dst->_f = src->_f
+#define copy_dev_field(_err, _f, _msg, ...) \
+ copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+#define copy_fs_field(_err, _f, _msg, ...) \
+ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
+
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+
+ for_each_member_device(ca, c, dev) {
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
+ dev_usage_u64s());
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(dev_usage_buckets_wrong,
+ d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(dev_usage_sectors_wrong,
+ d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(dev_usage_fragmented_wrong,
+ d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ }
+
+ copy_dev_field(dev_usage_buckets_ec_wrong,
+ buckets_ec, "buckets_ec");
+ }
+
+ {
+ unsigned nr = fs_usage_u64s(c);
+ struct bch_fs_usage *dst = c->usage_base;
+ struct bch_fs_usage *src = (void *)
+ bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
+
+ copy_fs_field(fs_usage_hidden_wrong,
+ hidden, "hidden");
+ copy_fs_field(fs_usage_btree_wrong,
+ btree, "btree");
+
+ if (!metadata_only) {
+ copy_fs_field(fs_usage_data_wrong,
+ data, "data");
+ copy_fs_field(fs_usage_cached_wrong,
+ cached, "cached");
+ copy_fs_field(fs_usage_reserved_wrong,
+ reserved, "reserved");
+ copy_fs_field(fs_usage_nr_inodes_wrong,
+				      nr_inodes, "nr_inodes");
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ copy_fs_field(fs_usage_persistent_reserved_wrong,
+ persistent_reserved[i],
+ "persistent_reserved[%i]", i);
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ if (metadata_only &&
+ (e->data_type == BCH_DATA_user ||
+ e->data_type == BCH_DATA_cached))
+ continue;
+
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, e);
+
+ copy_fs_field(fs_usage_replicas_wrong,
+ replicas[i], "%s", buf.buf);
+ }
+ }
+
+#undef copy_fs_field
+#undef copy_dev_field
+#undef copy_stripe_field
+#undef copy_field
+fsck_err:
+ if (ca)
+ percpu_ref_put(&ca->ref);
+ if (ret)
+ bch_err_fn(c, ret);
+
+ percpu_up_write(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_start(struct bch_fs *c)
+{
+ struct bch_dev *ca = NULL;
+ unsigned i;
+
+ BUG_ON(c->usage_gc);
+
+ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
+ if (!c->usage_gc) {
+ bch_err(c, "error allocating c->usage_gc");
+ return -BCH_ERR_ENOMEM_gc_start;
+ }
+
+ for_each_member_device(ca, c, i) {
+ BUG_ON(ca->usage_gc);
+
+ ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage_gc) {
+ bch_err(c, "error allocating ca->usage_gc");
+ percpu_ref_put(&ca->ref);
+ return -BCH_ERR_ENOMEM_gc_start;
+ }
+
+ this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
+ ca->mi.nbuckets - ca->mi.first_bucket);
+ }
+
+ return 0;
+}
+
+static int bch2_gc_reset(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
+ }
+
+ free_percpu(c->usage_gc);
+ c->usage_gc = NULL;
+
+ return bch2_gc_start(c);
+}
+
+/* returns true if not equal */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+ struct bch_alloc_v4 r)
+{
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type ||
+ l.dirty_sectors != r.dirty_sectors ||
+ l.cached_sectors != r.cached_sectors ||
+ l.stripe_redundancy != r.stripe_redundancy ||
+ l.stripe != r.stripe;
+}
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+ struct bucket gc, *b;
+ struct bkey_i_alloc_v4 *a;
+ struct bch_alloc_v4 old_convert, new;
+ const struct bch_alloc_v4 *old;
+ enum bch_data_type type;
+ int ret;
+
+ if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
+ return 1;
+
+ old = bch2_alloc_to_v4(k, &old_convert);
+ new = *old;
+
+ percpu_down_read(&c->mark_lock);
+ b = gc_bucket(ca, iter->pos.offset);
+
+ /*
+ * b->data_type doesn't yet include need_discard & need_gc_gen states -
+ * fix that here:
+ */
+ type = __alloc_data_type(b->dirty_sectors,
+ b->cached_sectors,
+ b->stripe,
+ *old,
+ b->data_type);
+ if (b->data_type != type) {
+ struct bch_dev_usage *u;
+
+ preempt_disable();
+ u = this_cpu_ptr(ca->usage_gc);
+ u->d[b->data_type].buckets--;
+ b->data_type = type;
+ u->d[b->data_type].buckets++;
+ preempt_enable();
+ }
+
+ gc = *b;
+ percpu_up_read(&c->mark_lock);
+
+ if (metadata_only &&
+ gc.data_type != BCH_DATA_sb &&
+ gc.data_type != BCH_DATA_journal &&
+ gc.data_type != BCH_DATA_btree)
+ return 0;
+
+ if (gen_after(old->gen, gc.gen))
+ return 0;
+
+ if (c->opts.reconstruct_alloc ||
+ fsck_err_on(new.data_type != gc.data_type, c,
+ alloc_key_data_type_wrong,
+ "bucket %llu:%llu gen %u has wrong data_type"
+ ": got %s, should be %s",
+ iter->pos.inode, iter->pos.offset,
+ gc.gen,
+ bch2_data_types[new.data_type],
+ bch2_data_types[gc.data_type]))
+ new.data_type = gc.data_type;
+
+#define copy_bucket_field(_errtype, _f) \
+ if (c->opts.reconstruct_alloc || \
+ fsck_err_on(new._f != gc._f, c, _errtype, \
+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \
+ ": got %u, should be %u", \
+ iter->pos.inode, iter->pos.offset, \
+ gc.gen, \
+ bch2_data_types[gc.data_type], \
+ new._f, gc._f)) \
+		new._f = gc._f
+
+ copy_bucket_field(alloc_key_gen_wrong,
+ gen);
+ copy_bucket_field(alloc_key_dirty_sectors_wrong,
+ dirty_sectors);
+ copy_bucket_field(alloc_key_cached_sectors_wrong,
+ cached_sectors);
+ copy_bucket_field(alloc_key_stripe_wrong,
+ stripe);
+ copy_bucket_field(alloc_key_stripe_redundancy_wrong,
+ stripe_redundancy);
+#undef copy_bucket_field
+
+ if (!bch2_alloc_v4_cmp(*old, new))
+ return 0;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ a->v = new;
+
+ /*
+ * The trigger normally makes sure this is set, but we're not running
+ * triggers:
+ */
+ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
+ a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
+fsck_err:
+ return ret;
+}
+
+static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+
+ for_each_member_device(ca, c, i) {
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_alloc_write_key(trans, &iter, k, metadata_only));
+
+ if (ret < 0) {
+ bch_err_fn(c, ret);
+ percpu_ref_put(&ca->ref);
+ break;
+ }
+ }
+
+ bch2_trans_put(trans);
+ return ret < 0 ? ret : 0;
+}
+
+static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
+{
+ struct bch_dev *ca;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bucket *g;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ unsigned i;
+ int ret;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!buckets) {
+ percpu_ref_put(&ca->ref);
+ bch_err(c, "error allocating ca->buckets[gc]");
+ ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ goto err;
+ }
+
+ buckets->first_bucket = ca->mi.first_bucket;
+ buckets->nbuckets = ca->mi.nbuckets;
+ rcu_assign_pointer(ca->buckets_gc, buckets);
+ }
+
+ ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = gc_bucket(ca, k.k->p.offset);
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ g->gen_valid = 1;
+ g->gen = a->gen;
+
+ if (metadata_only &&
+ (a->data_type == BCH_DATA_user ||
+ a->data_type == BCH_DATA_cached ||
+ a->data_type == BCH_DATA_parity)) {
+ g->data_type = a->data_type;
+ g->dirty_sectors = a->dirty_sectors;
+ g->cached_sectors = a->cached_sectors;
+ g->stripe = a->stripe;
+ g->stripe_redundancy = a->stripe_redundancy;
+ }
+
+ 0;
+ }));
+err:
+ bch2_trans_put(trans);
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = gc_bucket_array(ca);
+ struct bucket *g;
+
+ for_each_bucket(g, buckets) {
+ if (metadata_only &&
+ (g->data_type == BCH_DATA_user ||
+ g->data_type == BCH_DATA_cached ||
+ g->data_type == BCH_DATA_parity))
+ continue;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ }
+ }
+}
+
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ size_t *idx)
+{
+ struct bch_fs *c = trans->c;
+ const __le64 *refcount = bkey_refcount_c(k);
+ struct printbuf buf = PRINTBUF;
+ struct reflink_gc *r;
+ int ret = 0;
+
+ if (!refcount)
+ return 0;
+
+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+ r->offset < k.k->p.offset)
+ ++*idx;
+
+ if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ return -EINVAL;
+ }
+
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+ reflink_v_refcount_wrong,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ r->refcount)) {
+ struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
+
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ if (!r->refcount)
+ new->k.type = KEY_TYPE_deleted;
+ else
+ *bkey_refcount(new) = cpu_to_le64(r->refcount);
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t idx = 0;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ trans = bch2_trans_get(c);
+
+ ret = for_each_btree_key_commit(trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx));
+
+ c->reflink_gc_nr = 0;
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int bch2_gc_reflink_start(struct bch_fs *c,
+ bool metadata_only)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct reflink_gc *r;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ trans = bch2_trans_get(c);
+ c->reflink_gc_nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ continue;
+
+ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+ GFP_KERNEL);
+ if (!r) {
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ break;
+ }
+
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
+{
+ struct genradix_iter iter;
+ struct reflink_gc *r;
+
+ genradix_for_each(&c->reflink_gc_table, iter, r)
+ r->refcount = 0;
+}
+
+static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ const struct bch_stripe *s;
+ struct gc_stripe *m;
+ bool bad = false;
+ unsigned i;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_stripe)
+ return 0;
+
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+
+ for (i = 0; i < s->nr_blocks; i++) {
+ u32 old = stripe_blockcount_get(s, i);
+ u32 new = (m ? m->block_sectors[i] : 0);
+
+ if (old != new) {
+ prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
+ i, old, new);
+ bad = true;
+ }
+ }
+
+ if (bad)
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ if (fsck_err_on(bad, c, stripe_sector_count_wrong,
+ "%s", buf.buf)) {
+ struct bkey_i_stripe *new;
+
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&new->k_i, k);
+
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+
+ ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ trans = bch2_trans_get(c);
+
+ ret = for_each_btree_key_commit(trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_gc_write_stripes_key(trans, &iter, k));
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
+{
+ genradix_free(&c->gc_stripes);
+}
+
+/**
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
+ *
+ * @c: filesystem object
+ * @initial: are we in recovery?
+ * @metadata_only: are we just checking metadata references, or everything?
+ *
+ * Returns: 0 on success, or standard errcode on failure
+ *
+ * Order matters here:
+ * - Concurrent GC relies on the fact that we have a total ordering for
+ * everything that GC walks - see gc_will_visit_node(),
+ * gc_will_visit_root()
+ *
+ * - also, references move around in the course of index updates and
+ * various other crap: everything needs to agree on the ordering
+ * references are allowed to move around in - e.g., we're allowed to
+ * start with a reference owned by an open_bucket (the allocator) and
+ * move it to the btree, but not the reverse.
+ *
+ * This is necessary to ensure that gc doesn't miss references that
+ * move around - if references move backwards in the ordering GC
+ * uses, GC could skip past them
+ */
+int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ unsigned iter = 0;
+ int ret;
+
+ lockdep_assert_held(&c->state_lock);
+
+ down_write(&c->gc_lock);
+
+ bch2_btree_interior_updates_flush(c);
+
+ ret = bch2_gc_start(c) ?:
+ bch2_gc_alloc_start(c, metadata_only) ?:
+ bch2_gc_reflink_start(c, metadata_only);
+ if (ret)
+ goto out;
+again:
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+ bch2_mark_superblocks(c);
+
+ ret = bch2_gc_btrees(c, initial, metadata_only);
+
+ if (ret)
+ goto out;
+
+#if 0
+ bch2_mark_pending_btree_node_frees(c);
+#endif
+ c->gc_count++;
+
+ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+ (!iter && bch2_test_restart_gc)) {
+ if (iter++ > 2) {
+ bch_info(c, "Unable to fix bucket gens, looping");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * XXX: make sure gens we fixed got saved
+ */
+ bch_info(c, "Second GC pass needed, restarting:");
+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+
+ bch2_gc_stripes_reset(c, metadata_only);
+ bch2_gc_alloc_reset(c, metadata_only);
+ bch2_gc_reflink_reset(c, metadata_only);
+ ret = bch2_gc_reset(c);
+ if (ret)
+ goto out;
+
+ /* flush fsck errors, reset counters */
+ bch2_flush_fsck_errs(c);
+ goto again;
+ }
+out:
+ if (!ret) {
+ bch2_journal_block(&c->journal);
+
+ ret = bch2_gc_stripes_done(c, metadata_only) ?:
+ bch2_gc_reflink_done(c, metadata_only) ?:
+ bch2_gc_alloc_done(c, metadata_only) ?:
+ bch2_gc_done(c, initial, metadata_only);
+
+ bch2_journal_unblock(&c->journal);
+ }
+
+ percpu_down_write(&c->mark_lock);
+ /* Indicates that gc is no longer in progress: */
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+
+ bch2_gc_free(c);
+ percpu_up_write(&c->mark_lock);
+
+ up_write(&c->gc_lock);
+
+ /*
+ * At startup, allocations can happen directly instead of via the
+ * allocator thread - issue wakeup in case they blocked on gc_lock:
+ */
+ closure_wake_up(&c->freelist_wait);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int gc_btree_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ struct bkey_i *u;
+ int ret;
+
+ percpu_down_read(&c->mark_lock);
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ptr_stale(ca, ptr) > 16) {
+ percpu_up_read(&c->mark_lock);
+ goto update;
+ }
+ }
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
+ }
+ percpu_up_read(&c->mark_lock);
+ return 0;
+update:
+ u = bch2_bkey_make_mut(trans, iter, &k, 0);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bch2_extent_normalize(c, bkey_i_to_s(u));
+ return 0;
+}
+
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ struct bkey_i_alloc_v4 *a_mut;
+ int ret;
+
+ if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
+ return 0;
+
+ a_mut = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ return ret;
+
+ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
+ a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
+
+ return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
+}
+
+int bch2_gc_gens(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ u64 b, start_time = local_clock();
+ unsigned i;
+ int ret;
+
+ /*
+ * Ideally we would be using state_lock and not gc_lock here, but that
+ * introduces a deadlock in the RO path - we currently take the state
+ * lock at the start of going RO, thus the gc thread may get stuck:
+ */
+ if (!mutex_trylock(&c->gc_gens_lock))
+ return 0;
+
+ trace_and_count(c, gc_gens_start, c);
+ down_read(&c->gc_lock);
+ trans = bch2_trans_get(c);
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_gens *gens = bucket_gens(ca);
+
+ BUG_ON(ca->oldest_gen);
+
+ ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
+ if (!ca->oldest_gen) {
+ percpu_ref_put(&ca->ref);
+ ret = -BCH_ERR_ENOMEM_gc_gens;
+ goto err;
+ }
+
+ for (b = gens->first_bucket;
+ b < gens->nbuckets; b++)
+ ca->oldest_gen[b] = gens->b[b];
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (btree_type_has_ptrs(i)) {
+ c->gc_gens_btree = i;
+ c->gc_gens_pos = POS_MIN;
+
+ ret = for_each_btree_key_commit(trans, iter, i,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ gc_btree_gens_key(trans, &iter, k));
+ if (ret && !bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ if (ret)
+ goto err;
+ }
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN,
+ BTREE_ITER_PREFETCH,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_alloc_write_oldest_gen(trans, &iter, k));
+ if (ret && !bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ if (ret)
+ goto err;
+
+ c->gc_gens_btree = 0;
+ c->gc_gens_pos = POS_MIN;
+
+ c->gc_count++;
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+ trace_and_count(c, gc_gens_end, c);
+err:
+ for_each_member_device(ca, c, i) {
+ kvfree(ca->oldest_gen);
+ ca->oldest_gen = NULL;
+ }
+
+ bch2_trans_put(trans);
+ up_read(&c->gc_lock);
+ mutex_unlock(&c->gc_gens_lock);
+ return ret;
+}
+
+static int bch2_gc_thread(void *arg)
+{
+ struct bch_fs *c = arg;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last = atomic64_read(&clock->now);
+ unsigned last_kick = atomic_read(&c->kick_gc);
+ int ret;
+
+ set_freezable();
+
+ while (1) {
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ if (atomic_read(&c->kick_gc) != last_kick)
+ break;
+
+ if (c->btree_gc_periodic) {
+ unsigned long next = last + c->capacity / 16;
+
+ if (atomic64_read(&clock->now) >= next)
+ break;
+
+ bch2_io_clock_schedule_timeout(clock, next);
+ } else {
+ schedule();
+ }
+
+ try_to_freeze();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ last = atomic64_read(&clock->now);
+ last_kick = atomic_read(&c->kick_gc);
+
+ /*
+ * Full gc is currently incompatible with btree key cache:
+ */
+#if 0
+ ret = bch2_gc(c, false, false);
+#else
+ ret = bch2_gc_gens(c);
+#endif
+ if (ret < 0)
+ bch_err_fn(c, ret);
+
+ debug_check_no_locks_held();
+ }
+
+ return 0;
+}
+
+void bch2_gc_thread_stop(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ p = c->gc_thread;
+ c->gc_thread = NULL;
+
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+}
+
+int bch2_gc_thread_start(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ if (c->gc_thread)
+ return 0;
+
+ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
+ if (IS_ERR(p)) {
+ bch_err_fn(c, PTR_ERR(p));
+ return PTR_ERR(p);
+ }
+
+ get_task_struct(p);
+ c->gc_thread = p;
+ wake_up_process(p);
+ return 0;
+}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
new file mode 100644
index 000000000000..607575f83a00
--- /dev/null
+++ b/fs/bcachefs/btree_gc.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_H
+#define _BCACHEFS_BTREE_GC_H
+
+#include "bkey.h"
+#include "btree_types.h"
+
+int bch2_check_topology(struct bch_fs *);
+int bch2_gc(struct bch_fs *, bool, bool);
+int bch2_gc_gens(struct bch_fs *);
+void bch2_gc_thread_stop(struct bch_fs *);
+int bch2_gc_thread_start(struct bch_fs *);
+
+/*
+ * For concurrent mark and sweep (with other index updates), we define a total
+ * ordering of _all_ references GC walks:
+ *
+ * Note that some references will have the same GC position as others - e.g.
+ * everything within the same btree node; in those cases we're relying on
+ * whatever locking exists for where those references live, i.e. the write lock
+ * on a btree node.
+ *
+ * That locking is also required to ensure GC doesn't pass the updater in
+ * between the updater adding/removing the reference and updating the GC marks;
+ * without that, we would at best double count sometimes.
+ *
+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
+ * be held that prevents GC from passing the position the updater is at.
+ *
+ * (What about the start of gc, when we're clearing all the marks? GC clears the
+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
+ * position inside its cmpxchg loop, so crap magically works).
+ */
+
+/* Position of (the start of) a gc phase: */
+static inline struct gc_pos gc_phase(enum gc_phase phase)
+{
+ return (struct gc_pos) {
+ .phase = phase,
+ .pos = POS_MIN,
+ .level = 0,
+ };
+}
+
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
+{
+ return cmp_int(l.phase, r.phase) ?:
+ bpos_cmp(l.pos, r.pos) ?:
+ cmp_int(l.level, r.level);
+}
+
+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
+{
+ switch (id) {
+#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
+ BCH_BTREE_IDS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+static inline struct gc_pos gc_pos_btree(enum btree_id id,
+ struct bpos pos, unsigned level)
+{
+ return (struct gc_pos) {
+ .phase = btree_id_to_gc_phase(id),
+ .pos = pos,
+ .level = level,
+ };
+}
+
+/*
+ * GC position of the pointers within a btree node: note, _not_ for &b->key
+ * itself, that lives in the parent node:
+ */
+static inline struct gc_pos gc_pos_btree_node(struct btree *b)
+{
+ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level);
+}
+
+/*
+ * GC position of the pointer to a btree root: we don't use
+ * gc_pos_pointer_to_btree_node() here to avoid a potential race with
+ * btree_split() increasing the tree depth - the new root will have level > the
+ * old root and thus have a greater gc position than the old root, but that
+ * would be incorrect since once gc has marked the root it's not coming back.
+ */
+static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+{
+ return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH);
+}
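+
+/*
+ * Illustrative example (btree id chosen arbitrarily): within a single btree,
+ * the root position sorts after every node position, since it uses
+ * (SPOS_MAX, BTREE_MAX_DEPTH):
+ *
+ *	struct gc_pos node = gc_pos_btree(BTREE_ID_extents, POS(1, 0), 0);
+ *	struct gc_pos root = gc_pos_btree_root(BTREE_ID_extents);
+ *
+ *	gc_pos_cmp(node, root) < 0
+ */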
+
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
+{
+ unsigned seq;
+ bool ret;
+
+ do {
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+ return ret;
+}
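+
+/*
+ * Hypothetical caller sketch (the real callers live in the marking paths and
+ * are not shown here): an updater applying an index update at @pos only needs
+ * to update GC's in-memory marks itself if GC has already walked past @pos;
+ * otherwise GC will pick the update up when it gets there:
+ *
+ *	if (gc_visited(c, pos))
+ *		update_gc_marks_too();	// hypothetical helper
+ */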
+
+static inline void bch2_do_gc_gens(struct bch_fs *c)
+{
+ atomic_inc(&c->kick_gc);
+ if (c->gc_thread)
+ wake_up_process(c->gc_thread);
+}
+
+#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
new file mode 100644
index 000000000000..5a720f0cd5a6
--- /dev/null
+++ b/fs/bcachefs/btree_io.c
@@ -0,0 +1,2297 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "bkey_sort.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io_write.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "recovery.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+void bch2_btree_node_io_unlock(struct btree *b)
+{
+ EBUG_ON(!btree_node_write_in_flight(b));
+
+ clear_btree_node_write_in_flight_inner(b);
+ clear_btree_node_write_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_node_io_lock(struct btree *b)
+{
+ bch2_assert_btree_nodes_not_locked();
+
+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_read(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_write(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_read(struct btree *b)
+{
+ bch2_assert_btree_nodes_not_locked();
+
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_write(struct btree *b)
+{
+ bch2_assert_btree_nodes_not_locked();
+
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void verify_no_dups(struct btree *b,
+ struct bkey_packed *start,
+ struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bkey_packed *k, *p;
+
+ if (start == end)
+ return;
+
+ for (p = start, k = bkey_p_next(start);
+ k != end;
+ p = k, k = bkey_p_next(k)) {
+ struct bkey l = bkey_unpack_key(b, p);
+ struct bkey r = bkey_unpack_key(b, k);
+
+ BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
+ }
+#endif
+}
+
+static void set_needs_whiteout(struct bset *i, int v)
+{
+ struct bkey_packed *k;
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ k->needs_whiteout = v;
+}
+
+static void btree_bounce_free(struct bch_fs *c, size_t size,
+ bool used_mempool, void *p)
+{
+ if (used_mempool)
+ mempool_free(p, &c->btree_bounce_pool);
+ else
+ vpfree(p, size);
+}
+
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
+ bool *used_mempool)
+{
+ unsigned flags = memalloc_nofs_save();
+ void *p;
+
+ BUG_ON(size > btree_bytes(c));
+
+ *used_mempool = false;
+ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ if (!p) {
+ *used_mempool = true;
+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ }
+ memalloc_nofs_restore(flags);
+ return p;
+}
+
+static void sort_bkey_ptrs(const struct btree *bt,
+ struct bkey_packed **ptrs, unsigned nr)
+{
+ unsigned n = nr, a = nr / 2, b, c, d;
+
+ if (!a)
+ return;
+
+ /* Heap sort: see lib/sort.c: */
+ while (1) {
+ if (a)
+ a--;
+ else if (--n)
+ swap(ptrs[0], ptrs[n]);
+ else
+ break;
+
+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
+ b = bch2_bkey_cmp_packed(bt,
+ ptrs[c],
+ ptrs[d]) >= 0 ? c : d;
+ if (d == n)
+ b = c;
+
+ while (b != a &&
+ bch2_bkey_cmp_packed(bt,
+ ptrs[a],
+ ptrs[b]) >= 0)
+ b = (b - 1) / 2;
+ c = b;
+ while (b != a) {
+ b = (b - 1) / 2;
+ swap(ptrs[b], ptrs[c]);
+ }
+ }
+}
+
+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
+{
+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
+ bool used_mempool = false;
+ size_t bytes = b->whiteout_u64s * sizeof(u64);
+
+ if (!b->whiteout_u64s)
+ return;
+
+ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
+
+ ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
+
+ for (k = unwritten_whiteouts_start(c, b);
+ k != unwritten_whiteouts_end(c, b);
+ k = bkey_p_next(k))
+ *--ptrs = k;
+
+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
+
+ k = new_whiteouts;
+
+ while (ptrs != ptrs_end) {
+ bkey_p_copy(k, *ptrs);
+ k = bkey_p_next(k);
+ ptrs++;
+ }
+
+ verify_no_dups(b, new_whiteouts,
+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
+
+ memcpy_u64s(unwritten_whiteouts_start(c, b),
+ new_whiteouts, b->whiteout_u64s);
+
+ btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
+}
+
+static bool should_compact_bset(struct btree *b, struct bset_tree *t,
+ bool compacting, enum compact_mode mode)
+{
+ if (!bset_dead_u64s(b, t))
+ return false;
+
+ switch (mode) {
+ case COMPACT_LAZY:
+ return should_compact_bset_lazy(b, t) ||
+ (compacting && !bset_written(b, bset(b, t)));
+ case COMPACT_ALL:
+ return true;
+ default:
+ BUG();
+ }
+}
+
+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
+{
+ struct bset_tree *t;
+ bool ret = false;
+
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k, *n, *out, *start, *end;
+ struct btree_node_entry *src = NULL, *dst = NULL;
+
+ if (t != b->set && !bset_written(b, i)) {
+ src = container_of(i, struct btree_node_entry, keys);
+ dst = max(write_block(b),
+ (void *) btree_bkey_last(b, t - 1));
+ }
+
+ if (src != dst)
+ ret = true;
+
+ if (!should_compact_bset(b, t, ret, mode)) {
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src) +
+ le16_to_cpu(src->keys.u64s) *
+ sizeof(u64));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
+ continue;
+ }
+
+ start = btree_bkey_first(b, t);
+ end = btree_bkey_last(b, t);
+
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
+
+ out = i->start;
+
+ for (k = start; k != end; k = n) {
+ n = bkey_p_next(k);
+
+ if (!bkey_deleted(k)) {
+ bkey_p_copy(out, k);
+ out = bkey_p_next(out);
+ } else {
+ BUG_ON(k->needs_whiteout);
+ }
+ }
+
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ set_btree_bset_end(b, t);
+ bch2_bset_set_no_aux_tree(b, t);
+ ret = true;
+ }
+
+ bch2_verify_btree_nr_keys(b);
+
+ bch2_btree_build_aux_trees(b);
+
+ return ret;
+}
+
+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
+ enum compact_mode mode)
+{
+ return bch2_drop_whiteouts(b, mode);
+}
+
+static void btree_node_sort(struct bch_fs *c, struct btree *b,
+ unsigned start_idx,
+ unsigned end_idx,
+ bool filter_whiteouts)
+{
+ struct btree_node *out;
+ struct sort_iter_stack sort_iter;
+ struct bset_tree *t;
+ struct bset *start_bset = bset(b, &b->set[start_idx]);
+ bool used_mempool = false;
+ u64 start_time, seq = 0;
+ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
+ bool sorting_entire_node = start_idx == 0 &&
+ end_idx == b->nsets;
+
+ sort_iter_stack_init(&sort_iter, b);
+
+ for (t = b->set + start_idx;
+ t < b->set + end_idx;
+ t++) {
+ u64s += le16_to_cpu(bset(b, t)->u64s);
+ sort_iter_add(&sort_iter.iter,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ }
+
+ bytes = sorting_entire_node
+ ? btree_bytes(c)
+ : __vstruct_bytes(struct btree_node, u64s);
+
+ out = btree_bounce_alloc(c, bytes, &used_mempool);
+
+ start_time = local_clock();
+
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
+
+ out->keys.u64s = cpu_to_le16(u64s);
+
+ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
+
+ if (sorting_entire_node)
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+ start_time);
+
+ /* Make sure we preserve bset journal_seq: */
+ for (t = b->set + start_idx; t < b->set + end_idx; t++)
+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+ start_bset->journal_seq = cpu_to_le64(seq);
+
+ if (sorting_entire_node) {
+ u64s = le16_to_cpu(out->keys.u64s);
+
+ BUG_ON(bytes != btree_bytes(c));
+
+ /*
+ * Our temporary buffer is the same size as the btree node's
+ * buffer, we can just swap buffers instead of doing a big
+ * memcpy()
+ */
+ *out = *b->data;
+ out->keys.u64s = cpu_to_le16(u64s);
+ swap(out, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ } else {
+ start_bset->u64s = out->keys.u64s;
+ memcpy_u64s(start_bset->start,
+ out->keys.start,
+ le16_to_cpu(out->keys.u64s));
+ }
+
+ for (i = start_idx + 1; i < end_idx; i++)
+ b->nr.bset_u64s[start_idx] +=
+ b->nr.bset_u64s[i];
+
+ b->nsets -= shift;
+
+ for (i = start_idx + 1; i < b->nsets; i++) {
+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
+ b->set[i] = b->set[i + shift];
+ }
+
+ for (i = b->nsets; i < MAX_BSETS; i++)
+ b->nr.bset_u64s[i] = 0;
+
+ set_btree_bset_end(b, &b->set[start_idx]);
+ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
+
+ btree_bounce_free(c, bytes, used_mempool, out);
+
+ bch2_verify_btree_nr_keys(b);
+}
+
+void bch2_btree_sort_into(struct bch_fs *c,
+ struct btree *dst,
+ struct btree *src)
+{
+ struct btree_nr_keys nr;
+ struct btree_node_iter src_iter;
+ u64 start_time = local_clock();
+
+ BUG_ON(dst->nsets != 1);
+
+ bch2_bset_set_no_aux_tree(dst, dst->set);
+
+ bch2_btree_node_iter_init_from_start(&src_iter, src);
+
+ nr = bch2_sort_repack(btree_bset_first(dst),
+ src, &src_iter,
+ &dst->format,
+ true);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+ start_time);
+
+ set_btree_bset_end(dst, dst->set);
+
+ dst->nr.live_u64s += nr.live_u64s;
+ dst->nr.bset_u64s[0] += nr.bset_u64s[0];
+ dst->nr.packed_keys += nr.packed_keys;
+ dst->nr.unpacked_keys += nr.unpacked_keys;
+
+ bch2_verify_btree_nr_keys(dst);
+}
+
+/*
+ * We're about to add another bset to the btree node, so if there are currently
+ * too many bsets, sort some of them together:
+ */
+static bool btree_node_compact(struct bch_fs *c, struct btree *b)
+{
+ unsigned unwritten_idx;
+ bool ret = false;
+
+ for (unwritten_idx = 0;
+ unwritten_idx < b->nsets;
+ unwritten_idx++)
+ if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
+ break;
+
+ if (b->nsets - unwritten_idx > 1) {
+ btree_node_sort(c, b, unwritten_idx,
+ b->nsets, false);
+ ret = true;
+ }
+
+ if (unwritten_idx > 1) {
+ btree_node_sort(c, b, 0, unwritten_idx, false);
+ ret = true;
+ }
+
+ return ret;
+}
+
+void bch2_btree_build_aux_trees(struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ bch2_bset_build_aux_tree(b, t,
+ !bset_written(b, bset(b, t)) &&
+ t == bset_tree_last(b));
+}
+
+/*
+ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
+ *
+ * The first bset is going to be of similar order to the size of the node, the
+ * last bset is bounded by btree_write_set_buffer(), which is set to keep the
+ * memmove on insert from being too expensive: the middle bset should, ideally,
+ * be the geometric mean of the first and the last.
+ *
+ * Returns true if the middle bset is greater than that geometric mean:
+ */
+static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
+{
+ unsigned mid_u64s_bits =
+ (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
+
+ return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
+}
+
+/*
+ * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
+ * inserted into
+ *
+ * Safe to call if there already is an unwritten bset - will only add a new bset
+ * if @b doesn't already have one.
+ *
+ * If we had to sort (and thus invalidated iterators), the transaction's paths
+ * are reinitialized via bch2_trans_node_reinit_iter().
+ */
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_node_entry *bne;
+ bool reinit_iter = false;
+
+ EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
+ BUG_ON(bset_written(b, bset(b, &b->set[1])));
+ BUG_ON(btree_node_just_written(b));
+
+ if (b->nsets == MAX_BSETS &&
+ !btree_node_write_in_flight(b) &&
+ should_compact_all(c, b)) {
+ bch2_btree_node_write(c, b, SIX_LOCK_write,
+ BTREE_WRITE_init_next_bset);
+ reinit_iter = true;
+ }
+
+ if (b->nsets == MAX_BSETS &&
+ btree_node_compact(c, b))
+ reinit_iter = true;
+
+ BUG_ON(b->nsets >= MAX_BSETS);
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch2_bset_init_next(c, b, bne);
+
+ bch2_btree_build_aux_trees(b);
+
+ if (reinit_iter)
+ bch2_trans_node_reinit_iter(trans, b);
+}
+
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+ unsigned offset, int write)
+{
+ prt_printf(out, bch2_log_msg(c, "%s"),
+ write == READ
+ ? "error validating btree node "
+ : "corrupt btree node before write ");
+ if (ca)
+ prt_printf(out, "on %s ", ca->name);
+ prt_printf(out, "at btree ");
+ bch2_btree_pos_to_text(out, c, b);
+
+ prt_printf(out, "\n node offset %u", b->written);
+ if (i)
+ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ prt_str(out, ": ");
+}
+
+__printf(9, 10)
+static int __btree_err(int ret,
+ struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b,
+ struct bset *i,
+ int write,
+ bool have_retry,
+ enum bch_sb_error_id err_type,
+ const char *fmt, ...)
+{
+ struct printbuf out = PRINTBUF;
+ va_list args;
+
+ btree_err_msg(&out, c, ca, b, i, b->written, write);
+
+ va_start(args, fmt);
+ prt_vprintf(&out, fmt, args);
+ va_end(args);
+
+ if (write == WRITE) {
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = c->opts.errors == BCH_ON_ERROR_continue
+ ? 0
+ : -BCH_ERR_fsck_errors_not_fixed;
+ goto out;
+ }
+
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+ ret = -BCH_ERR_btree_node_read_err_fixable;
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret = -BCH_ERR_btree_node_read_err_bad_node;
+
+ if (ret != -BCH_ERR_btree_node_read_err_fixable)
+ bch2_sb_error_count(c, err_type);
+
+ switch (ret) {
+ case -BCH_ERR_btree_node_read_err_fixable:
+ ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ if (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore)
+ goto fsck_err;
+ ret = -BCH_ERR_fsck_fix;
+ break;
+ case -BCH_ERR_btree_node_read_err_want_retry:
+ case -BCH_ERR_btree_node_read_err_must_retry:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ break;
+ case -BCH_ERR_btree_node_read_err_bad_node:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ bch2_topology_error(c);
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
+ break;
+ case -BCH_ERR_btree_node_read_err_incompatible:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ break;
+ default:
+ BUG();
+ }
+out:
+fsck_err:
+ printbuf_exit(&out);
+ return ret;
+}
+
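+/*
+ * btree_err() wraps __btree_err(): if the error is fixable (and was fixed or
+ * ignored), it sets *saw_error and evaluates to true; otherwise it stashes the
+ * error in @ret and jumps to the caller's fsck_err label. btree_err_on() only
+ * fires when @cond is true.
+ */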
+#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \
+({ \
+ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \
+ BCH_FSCK_ERR_##_err_type, \
+ msg, ##__VA_ARGS__); \
+ \
+ if (_ret != -BCH_ERR_fsck_fix) { \
+ ret = _ret; \
+ goto fsck_err; \
+ } \
+ \
+ *saw_error = true; \
+})
+
+#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
+
+/*
+ * When btree topology repair changes the start or end of a node, that might
+ * mean we have to drop keys that are no longer inside the node:
+ */
+__cold
+void bch2_btree_node_drop_keys_outside_node(struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k;
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
+ break;
+
+ if (k != i->start) {
+ unsigned shift = (u64 *) k - (u64 *) i->start;
+
+ memmove_u64s_down(i->start, k,
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
+ set_btree_bset_end(b, t);
+ }
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
+ break;
+
+ if (k != vstruct_last(i)) {
+ i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
+ set_btree_bset_end(b, t);
+ }
+ }
+
+ /*
+ * Always rebuild search trees: eytzinger search tree nodes directly
+ * depend on the values of min/max key:
+ */
+ bch2_bset_set_no_aux_tree(b, b->set);
+ bch2_btree_build_aux_trees(b);
+
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+ }
+}
+
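+/*
+ * Validate a bset's header fields, and for the first bset in a node also the
+ * btree node header (sequence number, btree id, level, min/max keys, format).
+ */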
+static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+ unsigned offset, unsigned sectors,
+ int write, bool have_retry, bool *saw_error)
+{
+ unsigned version = le16_to_cpu(i->version);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ int ret = 0;
+
+ btree_err_on(!bch2_version_compatible(version),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i,
+ btree_node_unsupported_version,
+ "unsupported bset version %u.%u",
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version));
+
+ if (btree_err_on(version < c->sb.version_min,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bset_older_than_sb_min,
+ "bset version %u older than superblock version_min %u",
+ version, c->sb.version_min)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version_min = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (btree_err_on(BCH_VERSION_MAJOR(version) >
+ BCH_VERSION_MAJOR(c->sb.version),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bset_newer_than_sb,
+ "bset version %u newer than superblock version %u",
+ version, c->sb.version)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i,
+ btree_node_unsupported_version,
+ "BSET_SEPARATE_WHITEOUTS no longer supported");
+
+ if (btree_err_on(offset + sectors > btree_sectors(c),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_past_end_of_btree_node,
+ "bset past end of btree node")) {
+ i->u64s = 0;
+ ret = 0;
+ goto out;
+ }
+
+ btree_err_on(offset && !i->u64s,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_empty,
+ "empty bset");
+
+ btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_wrong_sector_offset,
+ "bset at wrong sector offset");
+
+ if (!offset) {
+ struct btree_node *bn =
+ container_of(i, struct btree_node, keys);
+ /* These indicate that we read the wrong btree node: */
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ /* XXX endianness */
+ btree_err_on(bp->seq != bn->keys.seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ bset_bad_seq,
+ "incorrect sequence number (wrong btree node)");
+ }
+
+ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_btree,
+ "incorrect btree id");
+
+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_level,
+ "incorrect level");
+
+ if (!write)
+ compat_btree_node(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write, bn);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ if (BTREE_PTR_RANGE_UPDATED(bp)) {
+ b->data->min_key = bp->min_key;
+ b->data->max_key = b->key.k.p;
+ }
+
+ btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_min_key,
+ "incorrect min_key: got %s should be %s",
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
+ (printbuf_reset(&buf2),
+ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
+ }
+
+ btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_max_key,
+ "incorrect max key %s",
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
+
+ if (write)
+ compat_btree_node(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write, bn);
+
+ btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
+ -BCH_ERR_btree_node_read_err_bad_node,
+ c, ca, b, i,
+ btree_node_bad_format,
+ "invalid bkey format: %s\n %s", buf1.buf,
+ (printbuf_reset(&buf2),
+ bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
+ printbuf_reset(&buf1);
+
+ compat_bformat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &bn->format);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+static int bset_key_invalid(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ bool updated_range, int rw,
+ struct printbuf *err)
+{
+ return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
+ (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
+}
+
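+/*
+ * Validate the keys in bset @i, applying endianness/version compatibility
+ * fixups and dropping any keys that are malformed, invalid or out of order.
+ */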
+static int validate_bset_keys(struct bch_fs *c, struct btree *b,
+ struct bset *i, int write,
+ bool have_retry, bool *saw_error)
+{
+ unsigned version = le16_to_cpu(i->version);
+ struct bkey_packed *k, *prev = NULL;
+ struct printbuf buf = PRINTBUF;
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+ int ret = 0;
+
+ for (k = i->start;
+ k != vstruct_last(i);) {
+ struct bkey_s u;
+ struct bkey tmp;
+
+ if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_past_bset_end,
+ "key extends past end of bset")) {
+ i->u64s = cpu_to_le16((u64 *) k - i->_data);
+ break;
+ }
+
+ if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_bad_format,
+ "invalid bkey format %u", k->format)) {
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_p_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+
+ /* XXX: validate k->u64s */
+ if (!write)
+ bch2_bkey_compat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &b->format, k);
+
+ u = __bkey_disassemble(b, k, &tmp);
+
+ printbuf_reset(&buf);
+ if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
+ printbuf_reset(&buf);
+ bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
+
+ btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bad_bkey,
+ "invalid bkey: %s", buf.buf);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_p_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+
+ if (write)
+ bch2_bkey_compat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &b->format, k);
+
+ if (prev && bkey_iter_cmp(b, prev, k) > 0) {
+ struct bkey up = bkey_unpack_key(b, prev);
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "keys out of order: ");
+ bch2_bkey_to_text(&buf, &up);
+ prt_printf(&buf, " > ");
+ bch2_bkey_to_text(&buf, u.k);
+
+ bch2_dump_bset(c, b, i, 0);
+
+ if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_out_of_order,
+ "%s", buf.buf)) {
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_p_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+ }
+
+ prev = k;
+ k = bkey_p_next(k);
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, bool have_retry, bool *saw_error)
+{
+ struct btree_node_entry *bne;
+ struct sort_iter *iter;
+ struct btree_node *sorted;
+ struct bkey_packed *k;
+ struct bch_extent_ptr *ptr;
+ struct bset *i;
+ bool used_mempool, blacklisted;
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+ unsigned u64s;
+ unsigned ptr_written = btree_ptr_sectors_written(&b->key);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0, retry_read = 0, write = READ;
+
+ b->version_ondisk = U16_MAX;
+ /* We might get called multiple times on read retry: */
+ b->written = 0;
+
+ iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
+ sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
+
+ if (bch2_meta_read_fault("btree"))
+ btree_err(-BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_fault_injected,
+ "dynamic fault");
+
+ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_magic,
+ "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(b->data->magic));
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ btree_err_on(b->data->keys.seq != bp->seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_seq,
+ "got wrong btree node (seq %llx want %llx)",
+ b->data->keys.seq, bp->seq);
+ } else {
+ btree_err_on(!b->data->keys.seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_seq,
+ "bad btree header: seq 0");
+ }
+
+ while (b->written < (ptr_written ?: btree_sectors(c))) {
+ unsigned sectors;
+ struct nonce nonce;
+ bool first = !b->written;
+ bool csum_bad;
+
+ if (!b->written) {
+ i = &b->data->keys;
+
+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
+
+ nonce = btree_nonce(i, b->written << 9);
+
+ csum_bad = bch2_crc_cmp(b->data->csum,
+ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_bad_csum,
+ "invalid checksum");
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i", ret))
+ goto fsck_err;
+
+ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, NULL, b, NULL,
+ btree_node_unsupported_version,
+ "btree node does not have NEW_EXTENT_OVERWRITE set");
+
+ sectors = vstruct_sectors(b->data, c->block_bits);
+ } else {
+ bne = write_block(b);
+ i = &bne->keys;
+
+ if (i->seq != b->data->keys.seq)
+ break;
+
+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
+
+ nonce = btree_nonce(i, b->written << 9);
+ csum_bad = bch2_crc_cmp(bne->csum,
+ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_bad_csum,
+ "invalid checksum");
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i\n", ret))
+ goto fsck_err;
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ b->version_ondisk = min(b->version_ondisk,
+ le16_to_cpu(i->version));
+
+ ret = validate_bset(c, ca, b, i, b->written, sectors,
+ READ, have_retry, saw_error);
+ if (ret)
+ goto fsck_err;
+
+ if (!b->written)
+ btree_node_set_format(b, b->data->format);
+
+ ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
+ if (ret)
+ goto fsck_err;
+
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ blacklisted = bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(i->journal_seq),
+ true);
+
+ btree_err_on(blacklisted && first,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_blacklisted_journal_seq,
+ "first btree node bset has blacklisted journal seq (%llu)",
+ le64_to_cpu(i->journal_seq));
+
+ btree_err_on(blacklisted && ptr_written,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ first_bset_blacklisted_journal_seq,
+ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
+ le64_to_cpu(i->journal_seq),
+ b->written, b->written + sectors, ptr_written);
+
+ b->written += sectors;
+
+ if (blacklisted && !first)
+ continue;
+
+ sort_iter_add(iter,
+ vstruct_idx(i, 0),
+ vstruct_last(i));
+ }
+
+ if (ptr_written) {
+ btree_err_on(b->written < ptr_written,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL,
+ btree_node_data_missing,
+ "btree node data missing: expected %u sectors, found %u",
+ ptr_written, b->written);
+ } else {
+ for (bne = write_block(b);
+ bset_byte_offset(b, bne) < btree_bytes(c);
+ bne = (void *) bne + block_bytes(c))
+ btree_err_on(bne->keys.seq == b->data->keys.seq &&
+ !bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq),
+ true),
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL,
+ btree_node_bset_after_end,
+ "found bset signature after last bset");
+ }
+
+ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+ sorted->keys.u64s = 0;
+
+ set_btree_bset(b, b->set, &b->data->keys);
+
+ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+
+ u64s = le16_to_cpu(sorted->keys.u64s);
+ *sorted = *b->data;
+ sorted->keys.u64s = cpu_to_le16(u64s);
+ swap(sorted, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ b->nsets = 1;
+
+ BUG_ON(b->nr.live_u64s != u64s);
+
+ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+
+ if (updated_range)
+ bch2_btree_node_drop_keys_outside_node(b);
+
+ i = &b->data->keys;
+ for (k = i->start; k != vstruct_last(i);) {
+ struct bkey tmp;
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+
+ printbuf_reset(&buf);
+
+ if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
+ (bch2_inject_invalid_keys &&
+ !bversion_cmp(u.k->version, MAX_VERSION))) {
+ printbuf_reset(&buf);
+
+ prt_printf(&buf, "invalid bkey: ");
+ bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
+
+ btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bad_bkey,
+ "%s", buf.buf);
+
+ btree_keys_account_key_drop(&b->nr, 0, k);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_p_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_bset_end(b, b->set);
+ continue;
+ }
+
+ if (u.k->type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
+
+ bp.v->mem_ptr = 0;
+ }
+
+ k = bkey_p_next(k);
+ }
+
+ bch2_bset_build_aux_tree(b, b->set, false);
+
+ set_needs_whiteout(btree_bset_first(b), true);
+
+ btree_node_reset_sib_u64s(b);
+
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ca2->mi.state != BCH_MEMBER_STATE_rw)
+ set_btree_node_need_rewrite(b);
+ }
+
+ if (!ptr_written)
+ set_btree_node_need_rewrite(b);
+out:
+ mempool_free(iter, &c->fill_iter);
+ printbuf_exit(&buf);
+ return retry_read;
+fsck_err:
+ if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+ ret == -BCH_ERR_btree_node_read_err_must_retry)
+ retry_read = 1;
+ else
+ set_btree_node_read_error(b);
+ goto out;
+}
+
+static void btree_node_read_work(struct work_struct *work)
+{
+ struct btree_read_bio *rb =
+ container_of(work, struct btree_read_bio, work);
+ struct bch_fs *c = rb->c;
+ struct btree *b = rb->b;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bio *bio = &rb->bio;
+ struct bch_io_failures failed = { .nr = 0 };
+ struct printbuf buf = PRINTBUF;
+ bool saw_error = false;
+ bool retry = false;
+ bool can_retry;
+
+ goto start;
+ while (1) {
+ retry = true;
+ bch_info(c, "retrying read");
+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = rb->pick.ptr.offset;
+ bio->bi_iter.bi_size = btree_bytes(c);
+
+ if (rb->have_ioref) {
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ submit_bio_wait(bio);
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+ }
+start:
+ printbuf_reset(&buf);
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "btree read error %s for %s",
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
+ if (rb->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+ rb->have_ioref = false;
+
+ bch2_mark_io_failure(&failed, &rb->pick);
+
+ can_retry = bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ &failed, &rb->pick) > 0;
+
+ if (!bio->bi_status &&
+ !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
+ if (retry)
+ bch_info(c, "retry success");
+ break;
+ }
+
+ saw_error = true;
+
+ if (!can_retry) {
+ set_btree_node_read_error(b);
+ break;
+ }
+ }
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+ rb->start_time);
+ bio_put(&rb->bio);
+
+ if (saw_error && !btree_node_read_error(b)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->key.k.p);
+ bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+ __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
+
+ bch2_btree_node_rewrite_async(c, b);
+ }
+
+ printbuf_exit(&buf);
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_endio(struct bio *bio)
+{
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
+
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
+
+ queue_work(c->io_complete_wq, &rb->work);
+}
+
+struct btree_node_read_all {
+ struct closure cl;
+ struct bch_fs *c;
+ struct btree *b;
+ unsigned nr;
+ void *buf[BCH_REPLICAS_MAX];
+ struct bio *bio[BCH_REPLICAS_MAX];
+ blk_status_t err[BCH_REPLICAS_MAX];
+};
+
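+/*
+ * Walk the on-disk btree node data and return how many sectors of valid bsets
+ * it contains (0 if the header magic doesn't match).
+ */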
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+ unsigned offset = 0;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return 0;
+
+ while (offset < btree_sectors(c)) {
+ if (!offset) {
+ offset += vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = data + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ offset += vstruct_sectors(bne, c->block_bits);
+ }
+ }
+
+ return offset;
+}
+
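+/*
+ * Check whether a bset with a matching sequence number exists past @offset -
+ * i.e. whether there is data beyond where this replica says the node ends.
+ */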
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+
+ if (!offset)
+ return false;
+
+ while (offset < btree_sectors(c)) {
+ bne = data + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq)
+ return true;
+ offset++;
+ }
+
+ return false;
+}
+
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
+{
+ closure_type(ra, struct btree_node_read_all, cl);
+ struct bch_fs *c = ra->c;
+ struct btree *b = ra->b;
+ struct printbuf buf = PRINTBUF;
+ bool dump_bset_maps = false;
+ bool have_retry = false;
+ int ret = 0, best = -1, write = READ;
+ unsigned i, written = 0, written2 = 0;
+ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+ bool _saw_error = false, *saw_error = &_saw_error;
+
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+
+ if (ra->err[i])
+ continue;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c) ||
+ (seq && seq != bn->keys.seq))
+ continue;
+
+ if (best < 0) {
+ best = i;
+ written = btree_node_sectors_written(c, bn);
+ continue;
+ }
+
+ written2 = btree_node_sectors_written(c, ra->buf[i]);
+ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_sectors_written_mismatch,
+ "btree node sectors written mismatch: %u != %u",
+ written, written2) ||
+ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_bset_after_end,
+ "found bset signature after last bset") ||
+ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_data_mismatch,
+ "btree node replicas content mismatch"))
+ dump_bset_maps = true;
+
+ if (written2 > written) {
+ written = written2;
+ best = i;
+ }
+ }
+fsck_err:
+ if (dump_bset_maps) {
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+ struct btree_node_entry *bne = NULL;
+ unsigned offset = 0, sectors;
+ bool gap = false;
+
+ if (ra->err[i])
+ continue;
+
+ printbuf_reset(&buf);
+
+ while (offset < btree_sectors(c)) {
+ if (!offset) {
+ sectors = vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
+ if (bne && bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ prt_printf(&buf, "*");
+ offset += sectors;
+ }
+
+ while (offset < btree_sectors(c)) {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq) {
+ if (!gap)
+ prt_printf(&buf, " GAP");
+ gap = true;
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
+ if (bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ prt_printf(&buf, "*");
+ }
+ offset++;
+ }
+
+ bch_err(c, "replica %u:%s", i, buf.buf);
+ }
+ }
+
+ if (best >= 0) {
+ memcpy(b->data, ra->buf[best], btree_bytes(c));
+ ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
+ } else {
+ ret = -1;
+ }
+
+ if (ret)
+ set_btree_node_read_error(b);
+ else if (*saw_error)
+ bch2_btree_node_rewrite_async(c, b);
+
+ for (i = 0; i < ra->nr; i++) {
+ mempool_free(ra->buf[i], &c->btree_bounce_pool);
+ bio_put(ra->bio[i]);
+ }
+
+ closure_debug_destroy(&ra->cl);
+ kfree(ra);
+ printbuf_exit(&buf);
+
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_all_replicas_endio(struct bio *bio)
+{
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
+ struct btree_node_read_all *ra = rb->ra;
+
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
+
+ ra->err[rb->idx] = bio->bi_status;
+ closure_put(&ra->cl);
+}
+
+/*
+ * XXX This allocates multiple times from the same mempools, and can deadlock
+ * under sufficient memory pressure (but is only a debug path)
+ */
+static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
+{
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded pick;
+ struct btree_node_read_all *ra;
+ unsigned i;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
+
+ closure_init(&ra->cl, NULL);
+ ra->c = c;
+ ra->b = b;
+ ra->nr = bch2_bkey_nr_ptrs(k);
+
+ for (i = 0; i < ra->nr; i++) {
+ ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ ra->bio[i] = bio_alloc_bioset(NULL,
+ buf_pages(ra->buf[i], btree_bytes(c)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ }
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct btree_read_bio *rb =
+ container_of(ra->bio[i], struct btree_read_bio, bio);
+ rb->c = c;
+ rb->b = b;
+ rb->ra = ra;
+ rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->idx = i;
+ rb->pick = pick;
+ rb->bio.bi_iter.bi_sector = pick.ptr.offset;
+ rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+ bio_sectors(&rb->bio));
+ bio_set_dev(&rb->bio, ca->disk_sb.bdev);
+
+ closure_get(&ra->cl);
+ submit_bio(&rb->bio);
+ } else {
+ ra->err[i] = BLK_STS_REMOVED;
+ }
+
+ i++;
+ }
+
+ if (sync) {
+ closure_sync(&ra->cl);
+ btree_node_read_all_replicas_done(&ra->cl.work);
+ } else {
+ continue_at(&ra->cl, btree_node_read_all_replicas_done,
+ c->io_complete_wq);
+ }
+
+ return 0;
+}
+
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+ bool sync)
+{
+ struct extent_ptr_decoded pick;
+ struct btree_read_bio *rb;
+ struct bch_dev *ca;
+ struct bio *bio;
+ int ret;
+
+ trace_and_count(c, btree_node_read, c, b);
+
+ if (bch2_verify_all_btree_replicas &&
+ !btree_node_read_all_replicas(c, b, sync))
+ return;
+
+ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+ NULL, &pick);
+
+ if (ret <= 0) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "btree node read error: no device to read from\n at ");
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err(c, "%s", buf.buf);
+
+ if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
+ bch2_fatal_error(c);
+
+ set_btree_node_read_error(b);
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+ printbuf_exit(&buf);
+ return;
+ }
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+ bio = bio_alloc_bioset(NULL,
+ buf_pages(b->data, btree_bytes(c)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ rb = container_of(bio, struct btree_read_bio, bio);
+ rb->c = c;
+ rb->b = b;
+ rb->ra = NULL;
+ rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->pick = pick;
+ INIT_WORK(&rb->work, btree_node_read_work);
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bio->bi_end_io = btree_node_read_endio;
+ bch2_bio_map(bio, b->data, btree_bytes(c));
+
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+ bio_sectors(bio));
+ bio_set_dev(bio, ca->disk_sb.bdev);
+
+ if (sync) {
+ submit_bio_wait(bio);
+
+ btree_node_read_work(&rb->work);
+ } else {
+ submit_bio(bio);
+ }
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+
+ if (sync)
+ btree_node_read_work(&rb->work);
+ else
+ queue_work(c->io_complete_wq, &rb->work);
+ }
+}
+
+static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
+{
+ struct bch_fs *c = trans->c;
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+
+ b = bch2_btree_node_mem_alloc(trans, level != 0);
+ bch2_btree_cache_cannibalize_unlock(c);
+
+ BUG_ON(IS_ERR(b));
+
+ bkey_copy(&b->key, k);
+ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
+
+ set_btree_node_read_in_flight(b);
+
+ bch2_btree_node_read(c, b, true);
+
+ if (btree_node_read_error(b)) {
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_btree_set_root_for_read(c, b);
+err:
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ return ret;
+}
+
+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
+{
+ return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
+}
+
+static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+ struct btree_write *w)
+{
+ unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+
+ do {
+ old = new = v;
+ if (!(old & 1))
+ break;
+
+ new &= ~1UL;
+ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+
+ if (old & 1)
+ closure_put(&((struct btree_update *) new)->cl);
+
+ bch2_journal_pin_drop(&c->journal, &w->journal);
+}
+
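+/*
+ * Finish a btree node write: if the node was redirtied and still needs a write
+ * (and writes aren't blocked), immediately start the next one; otherwise clear
+ * the write_in_flight flags and wake up waiters.
+ */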
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+ struct btree_write *w = btree_prev_write(b);
+ unsigned long old, new, v;
+ unsigned type = 0;
+
+ bch2_btree_complete_write(c, b, w);
+
+ v = READ_ONCE(b->flags);
+ do {
+ old = new = v;
+
+ if ((old & (1U << BTREE_NODE_dirty)) &&
+ (old & (1U << BTREE_NODE_need_write)) &&
+ !(old & (1U << BTREE_NODE_never_write)) &&
+ !(old & (1U << BTREE_NODE_write_blocked)) &&
+ !(old & (1U << BTREE_NODE_will_make_reachable))) {
+ new &= ~(1U << BTREE_NODE_dirty);
+ new &= ~(1U << BTREE_NODE_need_write);
+ new |= (1U << BTREE_NODE_write_in_flight);
+ new |= (1U << BTREE_NODE_write_in_flight_inner);
+ new |= (1U << BTREE_NODE_just_written);
+ new ^= (1U << BTREE_NODE_write_idx);
+
+ type = new & BTREE_WRITE_TYPE_MASK;
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ } else {
+ new &= ~(1U << BTREE_NODE_write_in_flight);
+ new &= ~(1U << BTREE_NODE_write_in_flight_inner);
+ }
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ if (new & (1U << BTREE_NODE_write_in_flight))
+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
+ else
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ __btree_node_write_done(c, b);
+ six_unlock_read(&b->c.lock);
+
+ bch2_trans_put(trans);
+}
+
+static void btree_node_write_work(struct work_struct *work)
+{
+ struct btree_write_bio *wbio =
+ container_of(work, struct btree_write_bio, work);
+ struct bch_fs *c = wbio->wbio.c;
+ struct btree *b = wbio->wbio.bio.bi_private;
+ struct bch_extent_ptr *ptr;
+ int ret = 0;
+
+ btree_bounce_free(c,
+ wbio->data_bytes,
+ wbio->wbio.used_mempool,
+ wbio->data);
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
+ goto err;
+
+ if (wbio->wbio.first_btree_write) {
+ if (wbio->wbio.failed.nr) {
+
+ }
+ } else {
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
+ BCH_WATERMARK_reclaim|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW,
+ !wbio->wbio.failed.nr));
+ if (ret)
+ goto err;
+ }
+out:
+ bio_put(&wbio->wbio.bio);
+ btree_node_write_done(c, b);
+ return;
+err:
+ set_btree_node_noevict(b);
+ if (!bch2_err_matches(ret, EROFS))
+ bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+ goto out;
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_write_bio *orig = parent ?: wbio;
+ struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
+ struct bch_fs *c = wbio->c;
+ struct btree *b = wbio->bio.bi_private;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ unsigned long flags;
+
+ if (wbio->have_ioref)
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "btree write error: %s",
+ bch2_blk_status_to_str(bio->bi_status)) ||
+ bch2_meta_write_fault("btree")) {
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+ bch2_dev_list_add_dev(&orig->failed, wbio->dev);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ }
+
+ if (wbio->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+
+ if (parent) {
+ bio_put(bio);
+ bio_endio(&parent->bio);
+ return;
+ }
+
+ clear_btree_node_write_in_flight_inner(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+ INIT_WORK(&wb->work, btree_node_write_work);
+ queue_work(c->btree_io_complete_wq, &wb->work);
+}
+
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+ struct bset *i, unsigned sectors)
+{
+ struct printbuf buf = PRINTBUF;
+ bool saw_error;
+ int ret;
+
+ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
+ BKEY_TYPE_btree, WRITE, &buf);
+
+ if (ret)
+ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
+ printbuf_exit(&buf);
+ if (ret)
+ return ret;
+
+ ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
+ if (ret) {
+ bch2_inconsistent_error(c);
+ dump_stack();
+ }
+
+ return ret;
+}
+
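+/*
+ * Final submission step: bsets after the first are written past the start of
+ * the node, so the pointers in the key are offset by the write's sector offset
+ * before the bio is submitted.
+ */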
+static void btree_write_submit(struct work_struct *work)
+{
+ struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+ struct bch_extent_ptr *ptr;
+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+
+ bkey_copy(&tmp.k, &wbio->key);
+
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
+ ptr->offset += wbio->sector_offset;
+
+ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
+ &tmp.k, false);
+}
+
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
+{
+ struct btree_write_bio *wbio;
+ struct bset_tree *t;
+ struct bset *i;
+ struct btree_node *bn = NULL;
+ struct btree_node_entry *bne = NULL;
+ struct sort_iter_stack sort_iter;
+ struct nonce nonce;
+ unsigned bytes_to_write, sectors_to_write, bytes, u64s;
+ u64 seq = 0;
+ bool used_mempool;
+ unsigned long old, new;
+ bool validate_before_checksum = false;
+ enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
+ void *data;
+ int ret;
+
+ if (flags & BTREE_WRITE_ALREADY_STARTED)
+ goto do_write;
+
+ /*
+ * We may only have a read lock on the btree node - the dirty bit is our
+ * "lock" against racing with other threads that may be trying to start
+ * a write, we do a write iff we clear the dirty bit. Since setting the
+ * dirty bit requires a write lock, we can't race with other threads
+ * redirtying it:
+ */
+ do {
+ old = new = READ_ONCE(b->flags);
+
+ if (!(old & (1 << BTREE_NODE_dirty)))
+ return;
+
+ if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+ !(old & (1 << BTREE_NODE_need_write)))
+ return;
+
+ if (old &
+ ((1 << BTREE_NODE_never_write)|
+ (1 << BTREE_NODE_write_blocked)))
+ return;
+
+ if (b->written &&
+ (old & (1 << BTREE_NODE_will_make_reachable)))
+ return;
+
+ if (old & (1 << BTREE_NODE_write_in_flight))
+ return;
+
+ if (flags & BTREE_WRITE_ONLY_IF_NEED)
+ type = new & BTREE_WRITE_TYPE_MASK;
+ new &= ~BTREE_WRITE_TYPE_MASK;
+
+ new &= ~(1 << BTREE_NODE_dirty);
+ new &= ~(1 << BTREE_NODE_need_write);
+ new |= (1 << BTREE_NODE_write_in_flight);
+ new |= (1 << BTREE_NODE_write_in_flight_inner);
+ new |= (1 << BTREE_NODE_just_written);
+ new ^= (1 << BTREE_NODE_write_idx);
+ } while (cmpxchg_acquire(&b->flags, old, new) != old);
+
+ if (new & (1U << BTREE_NODE_need_write))
+ return;
+do_write:
+ BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
+
+ atomic_dec(&c->btree_cache.dirty);
+
+ BUG_ON(btree_node_fake(b));
+ BUG_ON((b->will_make_reachable != 0) != !b->written);
+
+ BUG_ON(b->written >= btree_sectors(c));
+ BUG_ON(b->written & (block_sectors(c) - 1));
+ BUG_ON(bset_written(b, btree_bset_last(b)));
+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
+
+ bch2_sort_whiteouts(c, b);
+
+ sort_iter_stack_init(&sort_iter, b);
+
+ bytes = !b->written
+ ? sizeof(struct btree_node)
+ : sizeof(struct btree_node_entry);
+
+ bytes += b->whiteout_u64s * sizeof(u64);
+
+ for_each_bset(b, t) {
+ i = bset(b, t);
+
+ if (bset_written(b, i))
+ continue;
+
+ bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+ sort_iter_add(&sort_iter.iter,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ seq = max(seq, le64_to_cpu(i->journal_seq));
+ }
+
+ BUG_ON(b->written && !seq);
+
+ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+ bytes += 8;
+
+ /* buffer must be a multiple of the block size */
+ bytes = round_up(bytes, block_bytes(c));
+
+ data = btree_bounce_alloc(c, bytes, &used_mempool);
+
+ if (!b->written) {
+ bn = data;
+ *bn = *b->data;
+ i = &bn->keys;
+ } else {
+ bne = data;
+ bne->keys = b->data->keys;
+ i = &bne->keys;
+ }
+
+ i->journal_seq = cpu_to_le64(seq);
+ i->u64s = 0;
+
+ sort_iter_add(&sort_iter.iter,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+ SET_BSET_SEPARATE_WHITEOUTS(i, false);
+
+ b->whiteout_u64s = 0;
+
+ u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
+ le16_add_cpu(&i->u64s, u64s);
+
+ BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
+
+ set_needs_whiteout(i, false);
+
+ /* do we have data to write? */
+ if (b->written && !i->u64s)
+ goto nowrite;
+
+ bytes_to_write = vstruct_end(i) - data;
+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+ if (!b->written &&
+ b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+
+ memset(data + bytes_to_write, 0,
+ (sectors_to_write << 9) - bytes_to_write);
+
+ BUG_ON(b->written + sectors_to_write > btree_sectors(c));
+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+ BUG_ON(i->seq != b->data->keys.seq);
+
+ i->version = cpu_to_le16(c->sb.version);
+ SET_BSET_OFFSET(i, b->written);
+ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
+
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
+ validate_before_checksum = true;
+
+ /* validate_bset will be modifying: */
+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
+ validate_before_checksum = true;
+
+ /* if we're going to be encrypting, check metadata validity first: */
+ if (validate_before_checksum &&
+ validate_bset_for_write(c, b, i, sectors_to_write))
+ goto err;
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error encrypting btree node: %i\n", ret))
+ goto err;
+
+ nonce = btree_nonce(i, b->written << 9);
+
+ if (bn)
+ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
+ else
+ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+ /* if we're not encrypting, check metadata after checksumming: */
+ if (!validate_before_checksum &&
+ validate_bset_for_write(c, b, i, sectors_to_write))
+ goto err;
+
+ /*
+ * We handle btree write errors by immediately halting the journal -
+ * after we've done that, we can't issue any subsequent btree writes
+ * because they might have pointers to new nodes that failed to write.
+ *
+ * Furthermore, there's no point in doing any more btree writes because
+ * with the journal stopped, we're never going to update the journal to
+ * reflect that those writes were done and the data flushed from the
+ * journal:
+ *
+ * Also on journal error, the pending write may have updates that were
+ * never journalled (interior nodes, see btree_update_nodes_written()) -
+ * it's critical that we don't do the write in that case otherwise we
+ * will have updates visible that weren't in the journal:
+ *
+ * Make sure to update b->written so bch2_btree_init_next() doesn't
+ * break:
+ */
+ if (bch2_journal_error(&c->journal) ||
+ c->opts.nochanges)
+ goto err;
+
+ trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
+
+ wbio = container_of(bio_alloc_bioset(NULL,
+ buf_pages(data, sectors_to_write << 9),
+ REQ_OP_WRITE|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio),
+ struct btree_write_bio, wbio.bio);
+ wbio_init(&wbio->wbio.bio);
+ wbio->data = data;
+ wbio->data_bytes = bytes;
+ wbio->sector_offset = b->written;
+ wbio->wbio.c = c;
+ wbio->wbio.used_mempool = used_mempool;
+ wbio->wbio.first_btree_write = !b->written;
+ wbio->wbio.bio.bi_end_io = btree_node_write_endio;
+ wbio->wbio.bio.bi_private = b;
+
+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
+
+ bkey_copy(&wbio->key, &b->key);
+
+ b->written += sectors_to_write;
+
+ if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
+ cpu_to_le16(b->written);
+
+ atomic64_inc(&c->btree_write_stats[type].nr);
+ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
+
+ INIT_WORK(&wbio->work, btree_write_submit);
+ queue_work(c->io_complete_wq, &wbio->work);
+ return;
+err:
+ set_btree_node_noevict(b);
+ b->written += sectors_to_write;
+nowrite:
+ btree_bounce_free(c, bytes, used_mempool, data);
+ __btree_node_write_done(c, b);
+}
+
+/*
+ * Work that must be done with write lock held:
+ */
+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
+{
+ bool invalidated_iter = false;
+ struct btree_node_entry *bne;
+ struct bset_tree *t;
+
+ if (!btree_node_just_written(b))
+ return false;
+
+ BUG_ON(b->whiteout_u64s);
+
+ clear_btree_node_just_written(b);
+
+ /*
+ * Note: immediately after write, bset_written() doesn't work - the
+ * amount of data we had to write after compaction might have been
+ * smaller than the offset of the last bset.
+ *
+ * However, we know that all bsets have been written here, as long as
+ * we're still holding the write lock:
+ */
+
+ /*
+ * XXX: decide if we really want to unconditionally sort down to a
+ * single bset:
+ */
+ if (b->nsets > 1) {
+ btree_node_sort(c, b, 0, b->nsets, true);
+ invalidated_iter = true;
+ } else {
+ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
+ }
+
+ for_each_bset(b, t)
+ set_needs_whiteout(bset(b, t), true);
+
+ bch2_btree_verify(c, b);
+
+ /*
+ * If later we don't unconditionally sort down to a single bset, we have
+ * to ensure this is still true:
+ */
+ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch2_bset_init_next(c, b, bne);
+
+ bch2_btree_build_aux_trees(b);
+
+ return invalidated_iter;
+}
+
+/*
+ * Use this one if the node is intent locked:
+ */
+void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
+ enum six_lock_type lock_type_held,
+ unsigned flags)
+{
+ if (lock_type_held == SIX_LOCK_intent ||
+ (lock_type_held == SIX_LOCK_read &&
+ six_lock_tryupgrade(&b->c.lock))) {
+ __bch2_btree_node_write(c, b, flags);
+
+ /* don't cycle lock unnecessarily: */
+ if (btree_node_just_written(b) &&
+ six_trylock_write(&b->c.lock)) {
+ bch2_btree_post_write_cleanup(c, b);
+ six_unlock_write(&b->c.lock);
+ }
+
+ if (lock_type_held == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
+ } else {
+ __bch2_btree_node_write(c, b, flags);
+ if (lock_type_held == SIX_LOCK_write &&
+ btree_node_just_written(b))
+ bch2_btree_post_write_cleanup(c, b);
+ }
+}
+
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+ unsigned i;
+ bool ret = false;
+restart:
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, i, pos)
+ if (test_bit(flag, &b->flags)) {
+ rcu_read_unlock();
+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+ ret = true;
+ goto restart;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
+{
+ return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+}
+
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
+{
+ return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+}
+
+static const char * const bch2_btree_write_types[] = {
+#define x(t, n) [n] = #t,
+ BCH_BTREE_WRITE_TYPES()
+ NULL
+};
+
+void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 10);
+
+ prt_tab(out);
+ prt_str(out, "nr");
+ prt_tab(out);
+ prt_str(out, "size");
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
+ u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
+ u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
+
+ prt_printf(out, "%s:", bch2_btree_write_types[i]);
+ prt_tab(out);
+ prt_u64(out, nr);
+ prt_tab(out);
+ prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
+ prt_newline(out);
+ }
+}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
new file mode 100644
index 000000000000..e0d7fa5b1dfb
--- /dev/null
+++ b/fs/bcachefs/btree_io.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_IO_H
+#define _BCACHEFS_BTREE_IO_H
+
+#include "bkey_methods.h"
+#include "bset.h"
+#include "btree_locking.h"
+#include "checksum.h"
+#include "extents.h"
+#include "io_write_types.h"
+
+struct bch_fs;
+struct btree_write;
+struct btree;
+struct btree_iter;
+struct btree_node_read_all;
+
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
+{
+ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_inc(&c->btree_cache.dirty);
+}
+
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
+{
+ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_dec(&c->btree_cache.dirty);
+}
+
+static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+{
+ return k->k.type == KEY_TYPE_btree_ptr_v2
+ ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
+ : 0;
+}
+
+struct btree_read_bio {
+ struct bch_fs *c;
+ struct btree *b;
+ struct btree_node_read_all *ra;
+ u64 start_time;
+ unsigned have_ioref:1;
+ unsigned idx:7;
+ struct extent_ptr_decoded pick;
+ struct work_struct work;
+ struct bio bio;
+};
+
+struct btree_write_bio {
+ struct work_struct work;
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ void *data;
+ unsigned data_bytes;
+ unsigned sector_offset;
+ struct bch_write_bio wbio;
+};
+
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);
+
+enum compact_mode {
+ COMPACT_LAZY,
+ COMPACT_ALL,
+};
+
+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
+ enum compact_mode);
+
+static inline bool should_compact_bset_lazy(struct btree *b,
+ struct bset_tree *t)
+{
+ unsigned total_u64s = bset_u64s(t);
+ unsigned dead_u64s = bset_dead_u64s(b, t);
+
+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
+}
+
+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ if (should_compact_bset_lazy(b, t))
+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
+
+ return false;
+}
+
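+/*
+ * Nonce for checksumming/encrypting a bset, derived from the bset's sequence
+ * numbers and its sector offset within the node.
+ */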
+static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+ return (struct nonce) {{
+ [0] = cpu_to_le32(offset),
+ [1] = ((__le32 *) &i->seq)[0],
+ [2] = ((__le32 *) &i->seq)[1],
+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
+ }};
+}
+
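+/*
+ * Encrypt or decrypt (the cipher is a stream cipher, so the operation is the
+ * same) the bset at @offset; for the first bset, the btree node header fields
+ * from @flags onwards are included as well.
+ */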
+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+ struct nonce nonce = btree_nonce(i, offset);
+ int ret;
+
+ if (!offset) {
+ struct btree_node *bn = container_of(i, struct btree_node, keys);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+ &bn->flags, bytes);
+ if (ret)
+ return ret;
+
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
+ }
+
+ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
+}
+
+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
+
+void bch2_btree_node_drop_keys_outside_node(struct btree *);
+
+void bch2_btree_build_aux_trees(struct btree *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);
+
+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
+ struct btree *, bool, bool *);
+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+int bch2_btree_root_read(struct bch_fs *, enum btree_id,
+ const struct bkey_i *, unsigned);
+
+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+
+enum btree_write_flags {
+ __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
+ __BTREE_WRITE_ALREADY_STARTED,
+};
+#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED)
+#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED)
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
+void bch2_btree_node_write(struct bch_fs *, struct btree *,
+ enum six_lock_type, unsigned);
+
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+ enum six_lock_type lock_held)
+{
+ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
+}
+
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
+
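+/*
+ * Compatibility helpers for btree nodes written by older metadata versions:
+ * translate key formats, positions and node headers between the old on-disk
+ * layout and the current one.
+ */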
+static inline void compat_bformat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write, struct bkey_format *f)
+{
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes) {
+ swap(f->bits_per_field[BKEY_FIELD_INODE],
+ f->bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(f->field_offset[BKEY_FIELD_INODE],
+ f->field_offset[BKEY_FIELD_OFFSET]);
+ }
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ u64 max_packed =
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+ ? 0
+ : cpu_to_le64(U32_MAX - max_packed);
+ }
+}
+
+static inline void compat_bpos(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write, struct bpos *p)
+{
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bpos_swab(p);
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes)
+ swap(p->inode, p->offset);
+}
+
+static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct btree_node *bn)
+{
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bpos_eq(bn->min_key, POS_MIN) &&
+ write)
+ bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ write)
+ bn->max_key.snapshot = 0;
+
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ !write)
+ bn->max_key.snapshot = U32_MAX;
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bpos_eq(bn->min_key, POS_MIN) &&
+ !write)
+ bn->min_key = bpos_nosnap_successor(bn->min_key);
+}
+
+void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
new file mode 100644
index 000000000000..da594e006769
--- /dev/null
+++ b/fs/bcachefs/btree_iter.c
@@ -0,0 +1,3261 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "replicas.h"
+#include "snapshot.h"
+#include "trace.h"
+
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
+ struct btree_path *);
+
+static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
+{
+#ifdef TRACK_PATH_ALLOCATED
+ return iter->ip_allocated;
+#else
+ return 0;
+#endif
+}
+
+static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+
+static inline int __btree_path_cmp(const struct btree_path *l,
+ enum btree_id r_btree_id,
+ bool r_cached,
+ struct bpos r_pos,
+ unsigned r_level)
+{
+ /*
+ * Must match lock ordering as defined by __bch2_btree_node_lock:
+ */
+ return cmp_int(l->btree_id, r_btree_id) ?:
+ cmp_int((int) l->cached, (int) r_cached) ?:
+ bpos_cmp(l->pos, r_pos) ?:
+ -cmp_int(l->level, r_level);
+}
+
+static inline int btree_path_cmp(const struct btree_path *l,
+ const struct btree_path *r)
+{
+ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
+}
+
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_successor(p);
+ } else {
+ p = bpos_nosnap_successor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_predecessor(p);
+ } else {
+ p = bpos_nosnap_predecessor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
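+/*
+ * Position to start searching from: extents are indexed by their end position,
+ * so for extent iterators we search from just after iter->pos:
+ */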
+static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
+{
+ struct bpos pos = iter->pos;
+
+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ !bkey_eq(pos, POS_MAX))
+ pos = bkey_successor(iter, pos);
+ return pos;
+}
+
+static inline bool btree_path_pos_before_node(struct btree_path *path,
+ struct btree *b)
+{
+ return bpos_lt(path->pos, b->data->min_key);
+}
+
+static inline bool btree_path_pos_after_node(struct btree_path *path,
+ struct btree *b)
+{
+ return bpos_gt(path->pos, b->key.k.p);
+}
+
+static inline bool btree_path_pos_in_node(struct btree_path *path,
+ struct btree *b)
+{
+ return path->btree_id == b->c.btree_id &&
+ !btree_path_pos_before_node(path, b) &&
+ !btree_path_pos_after_node(path, b);
+}
+
+/* Btree iterator: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bkey_cached *ck;
+ bool locked = btree_node_locked(path, 0);
+
+ if (!bch2_btree_node_relock(trans, path, 0))
+ return;
+
+ ck = (void *) path->l[0].b;
+ BUG_ON(ck->key.btree_id != path->btree_id ||
+ !bkey_eq(ck->key.pos, path->pos));
+
+ if (!locked)
+ btree_node_unlock(trans, path, 0);
+}
+
+static void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree_path_level *l;
+ struct btree_node_iter tmp;
+ bool locked;
+ struct bkey_packed *p, *k;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct printbuf buf3 = PRINTBUF;
+ const char *msg;
+
+ if (!bch2_debug_check_iterators)
+ return;
+
+ l = &path->l[level];
+ tmp = l->iter;
+ locked = btree_node_locked(path, level);
+
+ if (path->cached) {
+ if (!level)
+ bch2_btree_path_verify_cached(trans, path);
+ return;
+ }
+
+ if (!btree_path_node(path, level))
+ return;
+
+ if (!bch2_btree_node_relock_notrace(trans, path, level))
+ return;
+
+ BUG_ON(!btree_path_pos_in_node(path, l->b));
+
+ bch2_btree_node_iter_verify(&l->iter, l->b);
+
+ /*
+ * For interior nodes, the iterator will have skipped past deleted keys:
+ */
+ p = level
+ ? bch2_btree_node_iter_prev(&tmp, l->b)
+ : bch2_btree_node_iter_prev_all(&tmp, l->b);
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
+ msg = "before";
+ goto err;
+ }
+
+ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
+ msg = "after";
+ goto err;
+ }
+
+ if (!locked)
+ btree_node_unlock(trans, path, level);
+ return;
+err:
+ bch2_bpos_to_text(&buf1, path->pos);
+
+ if (p) {
+ struct bkey uk = bkey_unpack_key(l->b, p);
+
+ bch2_bkey_to_text(&buf2, &uk);
+ } else {
+ prt_printf(&buf2, "(none)");
+ }
+
+ if (k) {
+ struct bkey uk = bkey_unpack_key(l->b, k);
+
+ bch2_bkey_to_text(&buf3, &uk);
+ } else {
+ prt_printf(&buf3, "(none)");
+ }
+
+ panic("path should be %s key at level %u:\n"
+ "path pos %s\n"
+ "prev key %s\n"
+ "cur key %s\n",
+ msg, level, buf1.buf, buf2.buf, buf3.buf);
+}
+
+static void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ unsigned i;
+
+ EBUG_ON(path->btree_id >= BTREE_ID_NR);
+
+ for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ if (!path->l[i].b) {
+ BUG_ON(!path->cached &&
+ bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
+ break;
+ }
+
+ bch2_btree_path_verify_level(trans, path, i);
+ }
+
+ bch2_btree_path_verify_locks(path);
+}
+
+void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify(trans, path);
+}
+
+static void bch2_btree_iter_verify(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+
+ BUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
+
+ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+ BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshot_field(iter->btree_id));
+
+ if (iter->update_path)
+ bch2_btree_path_verify(trans, iter->update_path);
+ bch2_btree_path_verify(trans, iter->path);
+}
+
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !iter->pos.snapshot);
+
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ iter->pos.snapshot != iter->snapshot);
+
+ BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+ bkey_gt(iter->pos, iter->k.p));
+}
+
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree_iter copy;
+ struct bkey_s_c prev;
+ int ret = 0;
+
+ if (!bch2_debug_check_iterators)
+ return 0;
+
+ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+ return 0;
+
+ if (bkey_err(k) || !k.k)
+ return 0;
+
+ BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot));
+
+ bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+ BTREE_ITER_NOPRESERVE|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ prev = bch2_btree_iter_prev(&copy);
+ if (!prev.k)
+ goto out;
+
+ ret = bkey_err(prev);
+ if (ret)
+ goto out;
+
+ if (bkey_eq(prev.k->p, k.k->p) &&
+ bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+ prev.k->p.snapshot) > 0) {
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ bch2_bkey_to_text(&buf1, k.k);
+ bch2_bkey_to_text(&buf2, prev.k);
+
+ panic("iter snap %u\n"
+ "k %s\n"
+ "prev %s\n",
+ iter->snapshot,
+ buf1.buf, buf2.buf);
+ }
+out:
+ bch2_trans_iter_exit(trans, &copy);
+ return ret;
+}
+
+void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos, bool key_cache)
+{
+ struct btree_path *path;
+ unsigned idx;
+ struct printbuf buf = PRINTBUF;
+
+ btree_trans_sort_paths(trans);
+
+ trans_for_each_path_inorder(trans, path, idx) {
+ int cmp = cmp_int(path->btree_id, id) ?:
+ cmp_int(path->cached, key_cache);
+
+ if (cmp > 0)
+ break;
+ if (cmp < 0)
+ continue;
+
+ if (!btree_node_locked(path, 0) ||
+ !path->should_be_locked)
+ continue;
+
+ if (!key_cache) {
+ if (bkey_ge(pos, path->l[0].b->data->min_key) &&
+ bkey_le(pos, path->l[0].b->key.k.p))
+ return;
+ } else {
+ if (bkey_eq(pos, path->pos))
+ return;
+ }
+ }
+
+ bch2_dump_trans_paths_updates(trans);
+ bch2_bpos_to_text(&buf, pos);
+
+ panic("not locked: %s %s%s\n",
+ bch2_btree_id_str(id), buf.buf,
+ key_cache ? " cached" : "");
+}
+
+#else
+
+static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned l) {}
+static inline void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path) {}
+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
+
+#endif
+
+/* Btree path: fixups after btree updates */
+
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct btree_node_iter_set *set;
+
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset) {
+ set->k = __btree_node_key_to_offset(b, k);
+ bch2_btree_node_iter_sort(iter, b);
+ return;
+ }
+
+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
+}
+
+static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_path_level *l = &path->l[b->c.level];
+
+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
+ return;
+
+ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
+ bch2_btree_node_iter_advance(&l->iter, l->b);
+}
+
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_path *path;
+
+ trans_for_each_path_with_node(trans, b, path) {
+ __bch2_btree_path_fix_key_modified(path, b, where);
+ bch2_btree_path_verify_level(trans, path, b->c.level);
+ }
+}
+
+static void __bch2_btree_node_iter_fix(struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ const struct bkey_packed *end = btree_bkey_last(b, t);
+ struct btree_node_iter_set *set;
+ unsigned offset = __btree_node_key_to_offset(b, where);
+ int shift = new_u64s - clobber_u64s;
+ unsigned old_end = t->end_offset - shift;
+ unsigned orig_iter_pos = node_iter->data[0].k;
+ bool iter_current_key_modified =
+ orig_iter_pos >= offset &&
+ orig_iter_pos <= offset + clobber_u64s;
+
+ btree_node_iter_for_each(node_iter, set)
+ if (set->end == old_end)
+ goto found;
+
+ /* didn't find the bset in the iterator - might have to re-add it: */
+ if (new_u64s &&
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
+ bch2_btree_node_iter_push(node_iter, b, where, end);
+ goto fixup_done;
+ } else {
+ /* Iterator is after key that changed */
+ return;
+ }
+found:
+ set->end = t->end_offset;
+
+ /* Iterator hasn't gotten to the key that changed yet: */
+ if (set->k < offset)
+ return;
+
+ if (new_u64s &&
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
+ set->k = offset;
+ } else if (set->k < offset + clobber_u64s) {
+ set->k = offset + new_u64s;
+ if (set->k == set->end)
+ bch2_btree_node_iter_set_drop(node_iter, set);
+ } else {
+ /* Iterator is after key that changed */
+ set->k = (int) set->k + shift;
+ return;
+ }
+
+ bch2_btree_node_iter_sort(node_iter, b);
+fixup_done:
+ if (node_iter->data[0].k != orig_iter_pos)
+ iter_current_key_modified = true;
+
+ /*
+ * When a new key is added, and the node iterator now points to that
+ * key, the iterator might have skipped past deleted keys that should
+ * come after the key the iterator now points to. We have to rewind to
+ * before those deleted keys - otherwise
+ * bch2_btree_node_iter_prev_all() breaks:
+ */
+ if (!bch2_btree_node_iter_end(node_iter) &&
+ iter_current_key_modified &&
+ b->c.level) {
+ struct bkey_packed *k, *k2, *p;
+
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
+
+ for_each_bset(b, t) {
+ bool set_pos = false;
+
+ if (node_iter->data[0].end == t->end_offset)
+ continue;
+
+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+ while ((p = bch2_bkey_prev_all(b, t, k2)) &&
+ bkey_iter_cmp(b, k, p) < 0) {
+ k2 = p;
+ set_pos = true;
+ }
+
+ if (set_pos)
+ btree_node_iter_set_set_pos(node_iter,
+ b, t, k2);
+ }
+ }
+}
+
+void bch2_btree_node_iter_fix(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
+ struct btree_path *linked;
+
+ if (node_iter != &path->l[b->c.level].iter) {
+ __bch2_btree_node_iter_fix(path, b, node_iter, t,
+ where, clobber_u64s, new_u64s);
+
+ if (bch2_debug_check_iterators)
+ bch2_btree_node_iter_verify(node_iter, b);
+ }
+
+ trans_for_each_path_with_node(trans, b, linked) {
+ __bch2_btree_node_iter_fix(linked, b,
+ &linked->l[b->c.level].iter, t,
+ where, clobber_u64s, new_u64s);
+ bch2_btree_path_verify_level(trans, linked, b->c.level);
+ }
+}
+
+/* Btree path level: pointer to a particular btree node and node iter */
+
+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
+ struct btree_path_level *l,
+ struct bkey *u,
+ struct bkey_packed *k)
+{
+ if (unlikely(!k)) {
+ /*
+ * signal to bch2_btree_iter_peek_slot() that we're currently at
+ * a hole
+ */
+ u->type = KEY_TYPE_deleted;
+ return bkey_s_c_null;
+ }
+
+ return bkey_disassemble(l->b, k, u);
+}
+
+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
+ struct btree_path_level *l,
+ struct bkey *u)
+{
+ return __btree_iter_unpack(c, l, u,
+ bch2_btree_node_iter_peek_all(&l->iter, l->b));
+}
+
+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
+{
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ path->pos = k.k ? k.k->p : l->b->key.k.p;
+ trans->paths_sorted = false;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
+ return k;
+}
+
+static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
+{
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+ bch2_btree_node_iter_prev(&l->iter, l->b));
+
+ path->pos = k.k ? k.k->p : l->b->data->min_key;
+ trans->paths_sorted = false;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
+ return k;
+}
+
+static inline bool btree_path_advance_to_pos(struct btree_path *path,
+ struct btree_path_level *l,
+ int max_advance)
+{
+ struct bkey_packed *k;
+ int nr_advanced = 0;
+
+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
+ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
+ if (max_advance > 0 && nr_advanced >= max_advance)
+ return false;
+
+ bch2_btree_node_iter_advance(&l->iter, l->b);
+ nr_advanced++;
+ }
+
+ return true;
+}
+
+static inline void __btree_path_level_init(struct btree_path *path,
+ unsigned level)
+{
+ struct btree_path_level *l = &path->l[level];
+
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+ /*
+ * Iterators to interior nodes should always be pointed at the first
+ * non-whiteout:
+ */
+ if (level)
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+}
+
+void bch2_btree_path_level_init(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ BUG_ON(path->cached);
+
+ EBUG_ON(!btree_path_pos_in_node(path, b));
+
+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
+ path->l[b->c.level].b = b;
+ __btree_path_level_init(path, b->c.level);
+}
+
+/* Btree path: fixups after btree node updates: */
+
+static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if (!i->cached &&
+ i->level == b->c.level &&
+ i->btree_id == b->c.btree_id &&
+ bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
+ bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
+ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
+ i->k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+}
+
+/*
+ * A btree node is being replaced - update the iterator to point to the new
+ * node:
+ */
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->uptodate == BTREE_ITER_UPTODATE &&
+ !path->cached &&
+ btree_path_pos_in_node(path, b)) {
+ enum btree_node_locked_type t =
+ btree_lock_want(path, b->c.level);
+
+ if (t != BTREE_NODE_UNLOCKED) {
+ btree_node_unlock(trans, path, b->c.level);
+ six_lock_increment(&b->c.lock, (enum six_lock_type) t);
+ mark_btree_node_locked(trans, path, b->c.level, t);
+ }
+
+ bch2_btree_path_level_init(trans, path, b);
+ }
+
+ bch2_trans_revalidate_updates_in_node(trans, b);
+}
+
+/*
+ * A btree node has been modified in such a way as to invalidate iterators - fix
+ * them:
+ */
+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
+{
+ struct btree_path *path;
+
+ trans_for_each_path_with_node(trans, b, path)
+ __btree_path_level_init(path, b->c.level);
+
+ bch2_trans_revalidate_updates_in_node(trans, b);
+}
+
+/* Btree path: traverse, set_pos: */
+
+static inline int btree_path_lock_root(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned depth_want,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
+ enum six_lock_type lock_type;
+ unsigned i;
+ int ret;
+
+ EBUG_ON(path->nodes_locked);
+
+ while (1) {
+ b = READ_ONCE(*rootp);
+ path->level = READ_ONCE(b->c.level);
+
+ if (unlikely(path->level < depth_want)) {
+ /*
+ * the root is at a lower depth than the depth we want:
+ * either we've reached the end of the btree, or we're
+ * walking nodes at or above some depth and there are no
+ * nodes at or above that depth
+ */
+ path->level = depth_want;
+ for (i = path->level; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
+ return 1;
+ }
+
+ lock_type = __btree_lock_want(path, path->level);
+ ret = btree_node_lock(trans, path, &b->c,
+ path->level, lock_type, trace_ip);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+ BUG();
+ }
+
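+ /* recheck that the root didn't change while we were waiting on the lock: */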
+ if (likely(b == READ_ONCE(*rootp) &&
+ b->c.level == path->level &&
+ !race_fault())) {
+ for (i = 0; i < path->level; i++)
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
+ path->l[path->level].b = b;
+ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
+
+ mark_btree_node_locked(trans, path, path->level,
+ (enum btree_node_locked_type) lock_type);
+ bch2_btree_path_level_init(trans, path, b);
+ return 0;
+ }
+
+ six_unlock_type(&b->c.lock, lock_type);
+ }
+}
+
+noinline
+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_node_iter node_iter = l->iter;
+ struct bkey_packed *k;
+ struct bkey_buf tmp;
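+ /* prefetch more aggressively before the filesystem has finished starting up: */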
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr-- && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
+
+ bch2_btree_node_iter_advance(&node_iter, l->b);
+ k = bch2_btree_node_iter_peek(&node_iter, l->b);
+ if (!k)
+ break;
+
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
+
+ if (!was_locked)
+ btree_node_unlock(trans, path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+ struct btree_and_journal_iter *jiter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr-- && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
+
+ bch2_btree_and_journal_iter_advance(jiter);
+ k = bch2_btree_and_journal_iter_peek(jiter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
+
+ if (!was_locked)
+ btree_node_unlock(trans, path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
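+/*
+ * Cache the in-memory address of a child node in the parent's btree_ptr_v2
+ * key, so future traversals can find it without a btree cache lookup:
+ */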
+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned plevel, struct btree *b)
+{
+ struct btree_path_level *l = &path->l[plevel];
+ bool locked = btree_node_locked(path, plevel);
+ struct bkey_packed *k;
+ struct bch_btree_ptr_v2 *bp;
+
+ if (!bch2_btree_node_relock(trans, path, plevel))
+ return;
+
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
+
+ bp = (void *) bkeyp_val(&l->b->format, k);
+ bp->mem_ptr = (unsigned long)b;
+
+ if (!locked)
+ btree_node_unlock(trans, path, plevel);
+}
+
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ struct bkey_buf *out)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_and_journal_iter jiter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+ k = bch2_btree_and_journal_iter_peek(&jiter);
+
+ bch2_bkey_buf_reassemble(out, c, k);
+
+ if (flags & BTREE_ITER_PREFETCH)
+ ret = btree_path_prefetch_j(trans, path, &jiter);
+
+ bch2_btree_and_journal_iter_exit(&jiter);
+ return ret;
+}
+
+static __always_inline int btree_path_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree *b;
+ unsigned level = path->level - 1;
+ enum six_lock_type lock_type = __btree_lock_want(path, level);
+ struct bkey_buf tmp;
+ int ret;
+
+ EBUG_ON(!btree_node_locked(path, path->level));
+
+ bch2_bkey_buf_init(&tmp);
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ if (ret)
+ goto err;
+ } else {
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (flags & BTREE_ITER_PREFETCH) {
+ ret = btree_path_prefetch(trans, path);
+ if (ret)
+ goto err;
+ }
+ }
+
+ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (unlikely(ret))
+ goto err;
+
+ if (likely(!trans->journal_replay_not_finished &&
+ tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
+ unlikely(b != btree_node_mem_ptr(tmp.k)))
+ btree_node_mem_ptr_set(trans, path, level + 1, b);
+
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ mark_btree_node_locked(trans, path, level,
+ (enum btree_node_locked_type) lock_type);
+ path->level = level;
+ bch2_btree_path_level_init(trans, path, b);
+
+ bch2_btree_path_verify_locks(path);
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
+
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ unsigned long trace_ip = _RET_IP_;
+ int i, ret = 0;
+
+ if (trans->in_traverse_all)
+ return -BCH_ERR_transaction_restart_in_traverse_all;
+
+ trans->in_traverse_all = true;
+retry_all:
+ trans->restarted = 0;
+ trans->last_restarted_ip = 0;
+
+ trans_for_each_path(trans, path)
+ path->should_be_locked = false;
+
+ btree_trans_sort_paths(trans);
+
+ bch2_trans_unlock(trans);
+ cond_resched();
+
+ if (unlikely(trans->memory_allocation_failure)) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+ }
+
+ /* Now, redo traversals in correct order: */
+ i = 0;
+ while (i < trans->nr_sorted) {
+ path = trans->paths + trans->sorted[i];
+
+ /*
+ * Traversing a path can cause another path to be added at about
+ * the same position:
+ */
+ if (path->uptodate) {
+ __btree_path_get(path, false);
+ ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+ __btree_path_put(path, false);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, ENOMEM))
+ goto retry_all;
+ if (ret)
+ goto err;
+ } else {
+ i++;
+ }
+ }
+
+ /*
+ * We used to assert that all paths had been traversed here
+ * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
+ * path->should_be_locked is not set yet, we might have unlocked and
+ * then failed to relock a path - that's fine.
+ */
+err:
+ bch2_btree_cache_cannibalize_unlock(c);
+
+ trans->in_traverse_all = false;
+
+ trace_and_count(c, trans_traverse_all, trans, trace_ip);
+ return ret;
+}
+
+static inline bool btree_path_check_pos_in_node(struct btree_path *path,
+ unsigned l, int check_pos)
+{
+ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
+ return false;
+ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
+ return false;
+ return true;
+}
+
+static inline bool btree_path_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned l, int check_pos)
+{
+ return is_btree_node(path, l) &&
+ bch2_btree_node_relock(trans, path, l) &&
+ btree_path_check_pos_in_node(path, l, check_pos);
+}
+
+static void btree_path_set_level_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_level)
+{
+ unsigned l;
+
+ path->level = new_level;
+
+ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, l);
+
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ bch2_btree_path_verify(trans, path);
+}
+
+static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
+{
+ unsigned i, l = path->level;
+again:
+ while (btree_path_node(path, l) &&
+ !btree_path_good_node(trans, path, l, check_pos))
+ __btree_path_set_level_up(trans, path, l++);
+
+ /* If we need intent locks, take them too: */
+ for (i = l + 1;
+ i < path->locks_want && btree_path_node(path, i);
+ i++)
+ if (!bch2_btree_node_relock(trans, path, i)) {
+ while (l <= i)
+ __btree_path_set_level_up(trans, path, l++);
+ goto again;
+ }
+
+ return l;
+}
+
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
+{
+ return likely(btree_node_locked(path, path->level) &&
+ btree_path_check_pos_in_node(path, path->level, check_pos))
+ ? path->level
+ : __btree_path_up_until_good_node(trans, path, check_pos);
+}
+
+/*
+ * This is the main state machine for walking down the btree - walks down to a
+ * specified depth
+ *
+ * Returns 0 on success, -EIO on error (error reading in a btree node).
+ *
+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is
+ * stashed in the iterator and returned from bch2_trans_exit().
+ */
+int bch2_btree_path_traverse_one(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ unsigned long trace_ip)
+{
+ unsigned depth_want = path->level;
+ int ret = -((int) trans->restarted);
+
+ if (unlikely(ret))
+ goto out;
+
+ if (unlikely(!trans->srcu_held))
+ bch2_trans_srcu_lock(trans);
+
+ /*
+ * Ensure we obey path->should_be_locked: if it's set, we can't unlock
+ * and re-traverse the path without a transaction restart:
+ */
+ if (path->should_be_locked) {
+ ret = bch2_btree_path_relock(trans, path, trace_ip);
+ goto out;
+ }
+
+ if (path->cached) {
+ ret = bch2_btree_path_traverse_cached(trans, path, flags);
+ goto out;
+ }
+
+ if (unlikely(path->level >= BTREE_MAX_DEPTH))
+ goto out;
+
+ path->level = btree_path_up_until_good_node(trans, path, 0);
+
+ EBUG_ON(btree_path_node(path, path->level) &&
+ !btree_node_locked(path, path->level));
+
+ /*
+ * Note: path->nodes[path->level] may be temporarily NULL here - that
+ * would indicate to other code that we got to the end of the btree,
+ * here it indicates that relocking the root failed - it's critical that
+ * btree_path_lock_root() comes next and that it can't fail
+ */
+ while (path->level > depth_want) {
+ ret = btree_path_node(path, path->level)
+ ? btree_path_down(trans, path, flags, trace_ip)
+ : btree_path_lock_root(trans, path, depth_want, trace_ip);
+ if (unlikely(ret)) {
+ if (ret == 1) {
+ /*
+ * No nodes at this level - got to the end of
+ * the btree:
+ */
+ ret = 0;
+ goto out;
+ }
+
+ __bch2_btree_path_unlock(trans, path);
+ path->level = depth_want;
+ path->l[path->level].b = ERR_PTR(ret);
+ goto out;
+ }
+ }
+
+ path->uptodate = BTREE_ITER_UPTODATE;
+out:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
+ panic("ret %s (%i) trans->restarted %s (%i)\n",
+ bch2_err_str(ret), ret,
+ bch2_err_str(trans->restarted), trans->restarted);
+ bch2_btree_path_verify(trans, path);
+ return ret;
+}
+
+static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+ struct btree_path *src)
+{
+ unsigned i, offset = offsetof(struct btree_path, pos);
+
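+ /* only copy the fields from ->pos onwards, preserving dst's own identity fields (idx, refcounts): */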
+ memcpy((void *) dst + offset,
+ (void *) src + offset,
+ sizeof(struct btree_path) - offset);
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+ unsigned t = btree_node_locked_type(dst, i);
+
+ if (t != BTREE_NODE_UNLOCKED)
+ six_lock_increment(&dst->l[i].b->c.lock, t);
+ }
+}
+
+static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
+ bool intent)
+{
+ struct btree_path *new = btree_path_alloc(trans, src);
+
+ btree_path_copy(trans, new, src);
+ __btree_path_get(new, intent);
+ return new;
+}
+
+__flatten
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
+ struct btree_path *path, bool intent,
+ unsigned long ip)
+{
+ __btree_path_put(path, intent);
+ path = btree_path_clone(trans, path, intent);
+ path->preserve = false;
+ return path;
+}
+
+struct btree_path * __must_check
+__bch2_btree_path_set_pos(struct btree_trans *trans,
+ struct btree_path *path, struct bpos new_pos,
+ bool intent, unsigned long ip, int cmp)
+{
+ unsigned level = path->level;
+
+ bch2_trans_verify_not_in_restart(trans);
+ EBUG_ON(!path->ref);
+
+ path = bch2_btree_path_make_mut(trans, path, intent, ip);
+
+ path->pos = new_pos;
+ trans->paths_sorted = false;
+
+ if (unlikely(path->cached)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ goto out;
+ }
+
+ level = btree_path_up_until_good_node(trans, path, cmp);
+
+ if (btree_path_node(path, level)) {
+ struct btree_path_level *l = &path->l[level];
+
+ BUG_ON(!btree_node_locked(path, level));
+ /*
+ * We might have to skip over many keys, or just a few: try
+ * advancing the node iterator, and if we have to skip over too
+ * many keys just reinit it (or if we're rewinding, since that
+ * is expensive).
+ */
+ if (cmp < 0 ||
+ !btree_path_advance_to_pos(path, l, 8))
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+ /*
+ * Iterators to interior nodes should always be pointed at the first
+ * non-whiteout:
+ */
+ if (unlikely(level))
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+ }
+
+ if (unlikely(level != path->level)) {
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ __bch2_btree_path_unlock(trans, path);
+ }
+out:
+ bch2_btree_path_verify(trans, path);
+ return path;
+}
+
+/* Btree path: main interface: */
+
+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *sib;
+
+ sib = prev_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
+
+ sib = next_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
+
+ return NULL;
+}
+
+static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *sib;
+
+ sib = prev_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
+
+ sib = next_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
+
+ return NULL;
+}
+
+static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+{
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_list_remove(trans, path);
+ trans->paths_allocated &= ~(1ULL << path->idx);
+}
+
+void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+{
+ struct btree_path *dup;
+
+ EBUG_ON(trans->paths + path->idx != path);
+ EBUG_ON(!path->ref);
+
+ if (!__btree_path_put(path, intent))
+ return;
+
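+ /*
+ * Only free this path if another path can take over its role, or if it's
+ * neither preserved nor pointing at a valid btree node:
+ */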
+ dup = path->preserve
+ ? have_path_at_pos(trans, path)
+ : have_node_at_pos(trans, path);
+
+ if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+ return;
+
+ if (path->should_be_locked &&
+ !trans->restarted &&
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+ return;
+
+ if (dup) {
+ dup->preserve |= path->preserve;
+ dup->should_be_locked |= path->should_be_locked;
+ }
+
+ __bch2_path_free(trans, path);
+}
+
+static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+ bool intent)
+{
+ EBUG_ON(trans->paths + path->idx != path);
+ EBUG_ON(!path->ref);
+
+ if (!__btree_path_put(path, intent))
+ return;
+
+ __bch2_path_free(trans, path);
+}
+
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+{
+ panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+ trans->restart_count, restart_count,
+ (void *) trans->last_begin_ip);
+}
+
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
+{
+ panic("in transaction restart: %s, last restarted by %pS\n",
+ bch2_err_str(trans->restarted),
+ (void *) trans->last_restarted_ip);
+}
+
+noinline __cold
+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
+
+ prt_printf(buf, "transaction updates for %s journal seq %llu",
+ trans->fn, trans->journal_res.seq);
+ prt_newline(buf);
+ printbuf_indent_add(buf, 2);
+
+ trans_for_each_update(trans, i) {
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+
+ prt_printf(buf, "update: btree=%s cached=%u %pS",
+ bch2_btree_id_str(i->btree_id),
+ i->cached,
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " old ");
+ bch2_bkey_val_to_text(buf, trans->c, old);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
+ prt_newline(buf);
+ }
+
+ trans_for_each_wb_update(trans, wb) {
+ prt_printf(buf, "update: btree=%s wb=1 %pS",
+ bch2_btree_id_str(wb->btree),
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
+ prt_newline(buf);
+ }
+
+ printbuf_indent_sub(buf, 2);
+}
+
+noinline __cold
+void bch2_dump_trans_updates(struct btree_trans *trans)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_updates_to_text(&buf, trans);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline __cold
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+{
+ prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+ path->idx, path->ref, path->intent_ref,
+ path->preserve ? 'P' : ' ',
+ path->should_be_locked ? 'S' : ' ',
+ bch2_btree_id_str(path->btree_id),
+ path->level);
+ bch2_bpos_to_text(out, path->pos);
+
+ prt_printf(out, " locks %u", path->nodes_locked);
+#ifdef TRACK_PATH_ALLOCATED
+ prt_printf(out, " %pS", (void *) path->ip_allocated);
+#endif
+ prt_newline(out);
+}
+
+static noinline __cold
+void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
+ bool nosort)
+{
+ struct btree_path *path;
+ unsigned idx;
+
+ if (!nosort)
+ btree_trans_sort_paths(trans);
+
+ trans_for_each_path_inorder(trans, path, idx)
+ bch2_btree_path_to_text(out, path);
+}
+
+noinline __cold
+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+ __bch2_trans_paths_to_text(out, trans, false);
+}
+
+static noinline __cold
+void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
+{
+ struct printbuf buf = PRINTBUF;
+
+ __bch2_trans_paths_to_text(&buf, trans, nosort);
+ bch2_trans_updates_to_text(&buf, trans);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+ __bch2_dump_trans_paths_updates(trans, false);
+}
+
+noinline __cold
+static void bch2_trans_update_max_paths(struct btree_trans *trans)
+{
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ struct printbuf buf = PRINTBUF;
+
+ if (!s)
+ return;
+
+ bch2_trans_paths_to_text(&buf, trans);
+
+ if (!buf.allocation_failure) {
+ mutex_lock(&s->lock);
+ if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
+ s->nr_max_paths = trans->nr_max_paths =
+ hweight64(trans->paths_allocated);
+ swap(s->max_paths_text, buf.buf);
+ }
+ mutex_unlock(&s->lock);
+ }
+
+ printbuf_exit(&buf);
+
+ trans->nr_max_paths = hweight64(trans->paths_allocated);
+}
+
+static noinline void btree_path_overflow(struct btree_trans *trans)
+{
+ bch2_dump_trans_paths_updates(trans);
+ panic("trans path overflow\n");
+}
+
+static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
+ struct btree_path *pos)
+{
+ struct btree_path *path;
+ unsigned idx;
+
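+ /* check whether all BTREE_ITER_MAX path slots are already allocated: */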
+ if (unlikely(trans->paths_allocated ==
+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+ btree_path_overflow(trans);
+
+ idx = __ffs64(~trans->paths_allocated);
+
+ /*
+ * Do this before marking the new path as allocated, since it won't be
+ * initialized yet:
+ */
+ if (unlikely(idx > trans->nr_max_paths))
+ bch2_trans_update_max_paths(trans);
+
+ trans->paths_allocated |= 1ULL << idx;
+
+ path = &trans->paths[idx];
+ path->idx = idx;
+ path->ref = 0;
+ path->intent_ref = 0;
+ path->nodes_locked = 0;
+ path->alloc_seq++;
+
+ btree_path_list_add(trans, pos, path);
+ trans->paths_sorted = false;
+ return path;
+}
+
+struct btree_path *bch2_path_get(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned level,
+ unsigned flags, unsigned long ip)
+{
+ struct btree_path *path, *path_pos = NULL;
+ bool cached = flags & BTREE_ITER_CACHED;
+ bool intent = flags & BTREE_ITER_INTENT;
+ int i;
+
+ bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_locks(trans);
+
+ btree_trans_sort_paths(trans);
+
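+ /* find the last existing path at or before the requested position: */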
+ trans_for_each_path_inorder(trans, path, i) {
+ if (__btree_path_cmp(path,
+ btree_id,
+ cached,
+ pos,
+ level) > 0)
+ break;
+
+ path_pos = path;
+ }
+
+ if (path_pos &&
+ path_pos->cached == cached &&
+ path_pos->btree_id == btree_id &&
+ path_pos->level == level) {
+ __btree_path_get(path_pos, intent);
+ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ } else {
+ path = btree_path_alloc(trans, path_pos);
+ path_pos = NULL;
+
+ __btree_path_get(path, intent);
+ path->pos = pos;
+ path->btree_id = btree_id;
+ path->cached = cached;
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ path->should_be_locked = false;
+ path->level = level;
+ path->locks_want = locks_want;
+ path->nodes_locked = 0;
+ for (i = 0; i < ARRAY_SIZE(path->l); i++)
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+#ifdef TRACK_PATH_ALLOCATED
+ path->ip_allocated = ip;
+#endif
+ trans->paths_sorted = false;
+ }
+
+ if (!(flags & BTREE_ITER_NOPRESERVE))
+ path->preserve = true;
+
+ if (path->intent_ref)
+ locks_want = max(locks_want, level + 1);
+
+ /*
+ * If the path has locks_want greater than requested, we don't downgrade
+ * it here: when a transaction restart was caused by a btree node split
+ * needing to upgrade locks, we may be putting and re-getting this
+ * iterator and want to keep the stronger locks.
+ * Downgrading iterators only happens via bch2_trans_downgrade(), after
+ * a successful transaction commit.
+ */
+
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ if (locks_want > path->locks_want)
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
+
+ return path;
+}
+
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+{
+
+ struct btree_path_level *l = path_l(path);
+ struct bkey_packed *_k;
+ struct bkey_s_c k;
+
+ if (unlikely(!l->b))
+ return bkey_s_c_null;
+
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ EBUG_ON(!btree_node_locked(path, path->level));
+
+ if (!path->cached) {
+ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
+
+ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
+
+ if (!k.k || !bpos_eq(path->pos, k.k->p))
+ goto hole;
+ } else {
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ EBUG_ON(ck &&
+ (path->btree_id != ck->key.btree_id ||
+ !bkey_eq(path->pos, ck->key.pos)));
+ if (!ck || !ck->valid)
+ return bkey_s_c_null;
+
+ *u = ck->k->k;
+ k = bkey_i_to_s_c(ck->k);
+ }
+
+ return k;
+hole:
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
+/* Btree iterators: */
+
+int __must_check
+__bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+}
+
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ int ret;
+
+ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
+ btree_iter_search_key(iter),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ if (ret)
+ return ret;
+
+ btree_path_set_should_be_locked(iter->path);
+ return 0;
+}
+
+/* Iterate across nodes (leaf and interior nodes) */
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree *b = NULL;
+ int ret;
+
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (ret)
+ goto err;
+
+ b = btree_path_node(iter->path, iter->path->level);
+ if (!b)
+ goto out;
+
+ BUG_ON(bpos_lt(b->key.k.p, iter->pos));
+
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ btree_path_set_should_be_locked(iter->path);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+
+ return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
+}
+
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+{
+ struct btree *b;
+
+ while (b = bch2_btree_iter_peek_node(iter),
+ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
+ bch2_trans_begin(iter->trans);
+
+ return b;
+}
+
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree_path *path = iter->path;
+ struct btree *b = NULL;
+ int ret;
+
+ bch2_trans_verify_not_in_restart(trans);
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+
+ /* already at end? */
+ if (!btree_path_node(path, path->level))
+ return NULL;
+
+ /* got to end? */
+ if (!btree_path_node(path, path->level + 1)) {
+ btree_path_set_level_up(trans, path);
+ return NULL;
+ }
+
+ if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+ __bch2_btree_path_unlock(trans, path);
+ path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ goto err;
+ }
+
+ b = btree_path_node(path, path->level + 1);
+
+ if (bpos_eq(iter->pos, b->key.k.p)) {
+ __btree_path_set_level_up(trans, path, path->level++);
+ } else {
+ /*
+ * Haven't gotten to the end of the parent node: go back down to
+ * the next child node
+ */
+ path = iter->path =
+ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ btree_path_set_level_down(trans, path, iter->min_depth);
+
+ ret = bch2_btree_path_traverse(trans, path, iter->flags);
+ if (ret)
+ goto err;
+
+ b = path->l[path->level].b;
+ }
+
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ btree_path_set_should_be_locked(iter->path);
+ BUG_ON(iter->path->uptodate);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+
+ return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
+}
+
+/* Iterate across keys (in leaf nodes only) */
+
+inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+{
+ if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
+
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
+ } else {
+ if (!btree_path_node(iter->path, iter->path->level))
+ return true;
+
+ iter->advanced = true;
+ return false;
+ }
+}
+
+inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+{
+ struct bpos pos = bkey_start_pos(&iter->k);
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, POS_MIN)
+ : bkey_eq(pos, POS_MIN));
+
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_predecessor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
+}
+
+static noinline
+struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
+{
+ struct btree_insert_entry *i;
+ struct bkey_i *ret = NULL;
+
+ trans_for_each_update(iter->trans, i) {
+ if (i->btree_id < iter->btree_id)
+ continue;
+ if (i->btree_id > iter->btree_id)
+ break;
+ if (bpos_lt(i->k->k.p, iter->path->pos))
+ continue;
+ if (i->key_cache_already_flushed)
+ continue;
+ if (!ret || bpos_lt(i->k->k.p, ret->k.p))
+ ret = i->k;
+ }
+
+ return ret;
+}
+
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
+{
+ return iter->flags & BTREE_ITER_WITH_UPDATES
+ ? __bch2_btree_trans_peek_updates(iter)
+ : NULL;
+}
+
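+/*
+ * Peek from the in-memory journal keys - keys from the journal that haven't
+ * been replayed into the btree yet:
+ */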
+static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end_pos)
+{
+ struct bkey_i *k;
+
+ if (bpos_lt(iter->path->pos, iter->journal_pos))
+ iter->journal_idx = 0;
+
+ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ iter->path->level,
+ iter->path->pos,
+ end_pos,
+ &iter->journal_idx);
+
+ iter->journal_pos = k ? k->k.p : end_pos;
+ return k;
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
+
+ if (k) {
+ iter->k = k->k;
+ return bkey_i_to_s_c(k);
+ } else {
+ return bkey_s_c_null;
+ }
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *next_journal =
+ bch2_btree_journal_peek(trans, iter,
+ k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
+
+ if (next_journal) {
+ iter->k = next_journal->k;
+ k = bkey_i_to_s_c(next_journal);
+ }
+
+ return k;
+}
+
+/*
+ * Checks the btree key cache for a key at @pos and returns it if present, or
+ * bkey_s_c_null:
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ struct bkey_s_c k;
+ int ret;
+
+ if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
+ bpos_eq(iter->pos, pos))
+ return bkey_s_c_null;
+
+ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+ return bkey_s_c_null;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+ iter->flags & BTREE_ITER_INTENT, 0,
+ iter->flags|BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL,
+ _THIS_IP_);
+
+ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ iter->flags|BTREE_ITER_CACHED) ?:
+ bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ btree_path_set_should_be_locked(iter->key_cache_path);
+
+ k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+ if (k.k && !bkey_err(k)) {
+ iter->k = u;
+ k.k = &iter->k;
+ }
+ return k;
+}
+
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bkey_i *next_update;
+ struct bkey_s_c k, k2;
+ int ret;
+
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+
+ while (1) {
+ struct btree_path_level *l;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out;
+ }
+
+ l = path_l(iter->path);
+
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
+ }
+
+ btree_path_set_should_be_locked(iter->path);
+
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ k = k2;
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ goto out;
+ }
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+ k = btree_trans_peek_journal(trans, iter, k);
+
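+ /* a pending transaction update at an equal or earlier position overrides the btree key: */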
+ next_update = btree_trans_peek_updates(iter);
+
+ if (next_update &&
+ bpos_le(next_update->k.p,
+ k.k ? k.k->p : l->b->key.k.p)) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ }
+
+ if (k.k && bkey_deleted(k.k)) {
+ /*
+ * If we've got a whiteout, and it's after the search
+ * key, advance the search key to the whiteout instead
+ * of just after the whiteout - it might be a btree
+ * whiteout, with a real key at the same position, since
+ * in the btree, deleted keys sort before non-deleted keys.
+ */
+ search_key = !bpos_eq(search_key, k.k->p)
+ ? k.k->p
+ : bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (likely(k.k)) {
+ break;
+ } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
+ /* Advance to next leaf node: */
+ search_key = bpos_successor(l->b->key.k.p);
+ } else {
+ /* End of btree: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
+ }
+ }
+out:
+ bch2_btree_iter_verify(iter);
+
+ return k;
+}
+
+/**
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ * @end: search limit: returns keys less than or equal to @end
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = btree_iter_search_key(iter);
+ struct bkey_s_c k;
+ struct bpos iter_pos;
+ int ret;
+
+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+ EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
+
+ if (iter->update_path) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ while (1) {
+ k = __bch2_btree_iter_peek(iter, search_key);
+ if (unlikely(!k.k))
+ goto end;
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
+
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode than the one requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ *
+ * But we can't do the full check here, because bkey_start_pos()
+ * isn't monotonically increasing before FILTER_SNAPSHOTS, and
+ * that's what we check against in extents mode:
+ */
+ if (k.k->p.inode > end.inode)
+ goto end;
+
+ if (iter->update_path &&
+ !bkey_eq(iter->update_path->pos, k.k->p)) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ (iter->flags & BTREE_ITER_INTENT) &&
+ !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
+
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ pos.snapshot = iter->snapshot;
+
+ /*
+ * advance, same as on exit for iter->path, but only up
+ * to snapshot
+ */
+ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = iter->path;
+
+ iter->update_path = bch2_btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+ }
+
+ /*
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
+ */
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
+
+ /*
+ * iter->pos should be monotonically increasing, and always be
+ * equal to the key we just returned - except extents can
+ * straddle iter->pos:
+ */
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+ iter_pos = k.k->p;
+ else
+ iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
+
+ if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_gt(iter_pos, end)
+ : bkey_ge(iter_pos, end)))
+ goto end;
+
+ break;
+ }
+
+ iter->pos = iter_pos;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ if (iter->update_path) {
+ ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
+ if (unlikely(ret))
+ k = bkey_s_c_err(ret);
+ else
+ btree_path_set_should_be_locked(iter->update_path);
+ }
+
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ iter->pos.snapshot = iter->snapshot;
+
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ }
+
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ return k;
+end:
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
+}
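+
+/*
+ * Example (illustrative sketch, not part of this interface): walk every key
+ * in [start, end] with the low level calls above. Real callers normally use
+ * the for_each_btree_key_upto*() macros from btree_iter.h, which also handle
+ * transaction restarts; example_scan_upto() is a hypothetical name.
+ */
+static int __maybe_unused example_scan_upto(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos start, struct bpos end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, btree, start, 0);
+
+ while (1) {
+ k = bch2_btree_iter_peek_upto(&iter, end);
+ ret = bkey_err(k);
+ if (ret || !k.k)
+ break;
+
+ /* inspect k here */
+
+ if (!bch2_btree_iter_advance(&iter))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}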
+
+/**
+ * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
+ * equal to iterator's current position, returning keys from every level of the
+ * btree. For keys at different levels of the btree that compare equal, the key
+ * from the lower level (leaf) is returned first.
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bkey_s_c k;
+ int ret;
+
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+ BUG_ON(iter->path->level < iter->min_depth);
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+ EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ /* Already at end? */
+ if (!btree_path_node(iter->path, iter->path->level)) {
+ k = bkey_s_c_null;
+ goto out_no_locked;
+ }
+
+ k = btree_path_level_peek_all(trans->c,
+ &iter->path->l[iter->path->level], &iter->k);
+
+ /* Check if we should go up to the parent node: */
+ if (!k.k ||
+ (iter->advanced &&
+ bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
+ iter->pos = path_l(iter->path)->b->key.k.p;
+ btree_path_set_level_up(trans, iter->path);
+ iter->advanced = false;
+ continue;
+ }
+
+ /*
+ * Check if we should go back down to a leaf:
+ * If we're not in a leaf node, we only return the current key
+ * if it exactly matches iter->pos - otherwise we first have to
+ * go back to the leaf:
+ */
+ if (iter->path->level != iter->min_depth &&
+ (iter->advanced ||
+ !k.k ||
+ !bpos_eq(iter->pos, k.k->p))) {
+ btree_path_set_level_down(trans, iter->path, iter->min_depth);
+ iter->pos = bpos_successor(iter->pos);
+ iter->advanced = false;
+ continue;
+ }
+
+ /* Check if we should go to the next key: */
+ if (iter->path->level == iter->min_depth &&
+ iter->advanced &&
+ k.k &&
+ bpos_eq(iter->pos, k.k->p)) {
+ iter->pos = bpos_successor(iter->pos);
+ iter->advanced = false;
+ continue;
+ }
+
+ if (iter->advanced &&
+ iter->path->level == iter->min_depth &&
+ !bpos_eq(k.k->p, iter->pos))
+ iter->advanced = false;
+
+ BUG_ON(iter->advanced);
+ BUG_ON(!k.k);
+ break;
+ }
+
+ iter->pos = k.k->p;
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ bch2_btree_iter_verify(iter);
+
+ return k;
+}
+
+/**
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
+ * position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_advance(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek(iter);
+}
+
+/**
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = iter->pos;
+ struct btree_path *saved_path = NULL;
+ struct bkey_s_c k;
+ struct bkey saved_k;
+ const struct bch_val *saved_v;
+ int ret;
+
+ EBUG_ON(iter->path->cached || iter->path->level);
+ EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+ if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+ return bkey_s_c_err(-EIO);
+
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ k = btree_path_level_peek(trans, iter->path,
+ &iter->path->l[0], &iter->k);
+ if (!k.k ||
+ ((iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bpos_ge(bkey_start_pos(k.k), search_key)
+ : bpos_gt(k.k->p, search_key)))
+ k = btree_path_level_prev(trans, iter->path,
+ &iter->path->l[0], &iter->k);
+
+ if (likely(k.k)) {
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+ if (k.k->p.snapshot == iter->snapshot)
+ goto got_key;
+
+ /*
+ * If we have a saved candidate, and we're no
+ * longer at the same _key_ (not pos), return
+ * that candidate
+ */
+ if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
+ bch2_path_put_nokeep(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = saved_path;
+ saved_path = NULL;
+ iter->k = saved_k;
+ k.v = saved_v;
+ goto got_key;
+ }
+
+ if (bch2_snapshot_is_ancestor(iter->trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ if (saved_path)
+ bch2_path_put_nokeep(trans, saved_path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_path = btree_path_clone(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_k = *k.k;
+ saved_v = k.v;
+ }
+
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+got_key:
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_predecessor(iter, k.k->p);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
+ continue;
+ }
+
+ break;
+ } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
+ /* Advance to previous leaf node: */
+ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+ } else {
+ /* Start of btree: */
+ bch2_btree_iter_set_pos(iter, POS_MIN);
+ k = bkey_s_c_null;
+ goto out_no_locked;
+ }
+ }
+
+ EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
+
+ /* Extents can straddle iter->pos: */
+ if (bkey_lt(k.k->p, iter->pos))
+ iter->pos = k.k->p;
+
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ iter->pos.snapshot = iter->snapshot;
+
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ if (saved_path)
+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+
+ return k;
+}
+
+/**
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
+ * position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_rewind(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_prev(iter);
+}
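+
+/*
+ * Example (illustrative sketch, not part of this interface): walk keys in
+ * reverse from @start towards the start of the btree with the calls above;
+ * restart handling is omitted and example_scan_reverse() is a hypothetical
+ * name - see for_each_btree_key_reverse() in btree_iter.h for the real
+ * pattern.
+ */
+static int __maybe_unused example_scan_reverse(struct btree_trans *trans,
+ enum btree_id btree, struct bpos start)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, btree, start, 0);
+
+ while (1) {
+ k = bch2_btree_iter_peek_prev(&iter);
+ ret = bkey_err(k);
+ if (ret || !k.k)
+ break;
+
+ /* inspect k here */
+
+ if (!bch2_btree_iter_rewind(&iter))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}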
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+ EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+
+ /* extents can't span inode numbers: */
+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
+ if (iter->pos.inode == KEY_INODE_MAX)
+ return bkey_s_c_null;
+
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ }
+
+ search_key = btree_iter_search_key(iter);
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ if ((iter->flags & BTREE_ITER_CACHED) ||
+ !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
+ struct bkey_i *next_update;
+
+ if ((next_update = btree_trans_peek_updates(iter)) &&
+ bpos_eq(next_update->k.p, iter->pos)) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ goto out;
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ (k = btree_trans_peek_slot_journal(trans, iter)).k)
+ goto out;
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ if (!bkey_err(k))
+ iter->k = *k.k;
+ /* We're not returning a key from iter->path: */
+ goto out_no_locked;
+ }
+
+ k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ if (unlikely(!k.k))
+ goto out_no_locked;
+ } else {
+ struct bpos next;
+ struct bpos end = iter->pos;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ end.offset = U64_MAX;
+
+ EBUG_ON(iter->path->level);
+
+ if (iter->flags & BTREE_ITER_INTENT) {
+ struct btree_iter iter2;
+
+ bch2_trans_copy_iter(&iter2, iter);
+ k = bch2_btree_iter_peek_upto(&iter2, end);
+
+ if (k.k && !bkey_err(k)) {
+ iter->k = iter2.k;
+ k.k = &iter->k;
+ }
+ bch2_trans_iter_exit(trans, &iter2);
+ } else {
+ struct bpos pos = iter->pos;
+
+ k = bch2_btree_iter_peek_upto(iter, end);
+ if (unlikely(bkey_err(k)))
+ bch2_btree_iter_set_pos(iter, pos);
+ else
+ iter->pos = pos;
+ }
+
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
+
+ next = k.k ? bkey_start_pos(k.k) : POS_MAX;
+
+ if (bkey_lt(iter->pos, next)) {
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ bch2_key_resize(&iter->k,
+ min_t(u64, KEY_SIZE_MAX,
+ (next.inode == iter->pos.inode
+ ? next.offset
+ : KEY_OFFSET_MAX) -
+ iter->pos.offset));
+ EBUG_ON(!iter->k.size);
+ }
+
+ k = (struct bkey_s_c) { &iter->k, NULL };
+ }
+ }
+out:
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ return k;
+}
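+
+/*
+ * Example (illustrative sketch, not part of this interface): point lookup of
+ * the slot at @pos; check bkey_err() and k.k before using the key.
+ * example_lookup_slot() is a hypothetical name; bch2_bkey_get_iter() in
+ * btree_iter.h wraps this pattern.
+ */
+static int __maybe_unused example_lookup_slot(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, pos, 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (!ret && k.k) {
+ /* use k here, while the iterator holds its locks */
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}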
+
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_advance(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_rewind(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(iter->trans) ||
+ (k = bch2_btree_iter_peek_type(iter, iter->flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(iter->trans);
+
+ return k;
+}
+
+/* new transactional stuff: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
+
+ trans_for_each_path(trans, path) {
+ BUG_ON(path->sorted_idx >= trans->nr_sorted);
+ BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+ }
+
+ for (i = 0; i < trans->nr_sorted; i++) {
+ unsigned idx = trans->sorted[i];
+
+ EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
+ BUG_ON(trans->paths[idx].sorted_idx != i);
+ }
+}
+
+static void btree_trans_verify_sorted(struct btree_trans *trans)
+{
+ struct btree_path *path, *prev = NULL;
+ unsigned i;
+
+ if (!bch2_debug_check_iterators)
+ return;
+
+ trans_for_each_path_inorder(trans, path, i) {
+ if (prev && btree_path_cmp(prev, path) > 0) {
+ __bch2_dump_trans_paths_updates(trans, true);
+ panic("trans paths out of order!\n");
+ }
+ prev = path;
+ }
+}
+#else
+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
+static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
+#endif
+
+void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
+{
+ int i, l = 0, r = trans->nr_sorted, inc = 1;
+ bool swapped;
+
+ btree_trans_verify_sorted_refs(trans);
+
+ if (trans->paths_sorted)
+ goto out;
+
+ /*
+ * Cocktail shaker sort: this is efficient because iterators will be
+ * mostly sorted.
+ */
+ do {
+ swapped = false;
+
+ for (i = inc > 0 ? l : r - 2;
+ i + 1 < r && i >= l;
+ i += inc) {
+ if (btree_path_cmp(trans->paths + trans->sorted[i],
+ trans->paths + trans->sorted[i + 1]) > 0) {
+ swap(trans->sorted[i], trans->sorted[i + 1]);
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+ trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
+ swapped = true;
+ }
+ }
+
+ if (inc > 0)
+ --r;
+ else
+ l++;
+ inc = -inc;
+ } while (swapped);
+
+ trans->paths_sorted = true;
+out:
+ btree_trans_verify_sorted(trans);
+}
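+
+/*
+ * Standalone illustration of the cocktail shaker sort used above (sketch
+ * only, on a plain int array): alternate forward and backward bubble passes,
+ * shrinking the unsorted window from both ends, and stop once a full pass
+ * makes no swaps.
+ */
+static void __maybe_unused example_cocktail_sort(int *a, int nr)
+{
+ int i, l = 0, r = nr, inc = 1;
+ bool swapped;
+
+ do {
+ swapped = false;
+
+ for (i = inc > 0 ? l : r - 2;
+ i + 1 < r && i >= l;
+ i += inc)
+ if (a[i] > a[i + 1]) {
+ swap(a[i], a[i + 1]);
+ swapped = true;
+ }
+
+ if (inc > 0)
+ --r;
+ else
+ l++;
+ inc = -inc;
+ } while (swapped);
+}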
+
+static inline void btree_path_list_remove(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned i;
+
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ trans->nr_sorted--;
+ memmove_u64s_down_small(trans->sorted + path->sorted_idx,
+ trans->sorted + path->sorted_idx + 1,
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+#else
+ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
+#endif
+ for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+
+ path->sorted_idx = U8_MAX;
+}
+
+static inline void btree_path_list_add(struct btree_trans *trans,
+ struct btree_path *pos,
+ struct btree_path *path)
+{
+ unsigned i;
+
+ path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
+
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
+ trans->sorted + path->sorted_idx,
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ trans->nr_sorted++;
+ trans->sorted[path->sorted_idx] = path->idx;
+#else
+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
+#endif
+
+ for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+
+ btree_trans_verify_sorted_refs(trans);
+}
+
+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
+{
+ if (iter->update_path)
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ if (iter->path)
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ if (iter->key_cache_path)
+ bch2_path_put(trans, iter->key_cache_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = NULL;
+ iter->update_path = NULL;
+ iter->key_cache_path = NULL;
+}
+
+void bch2_trans_iter_init_outlined(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned flags)
+{
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
+}
+
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id,
+ struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags)
+{
+ flags |= BTREE_ITER_NOT_EXTENTS;
+ flags |= __BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_ALL_SNAPSHOTS;
+
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
+ __bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
+
+ iter->min_depth = depth;
+
+ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(iter->path->level != depth);
+ BUG_ON(iter->min_depth != depth);
+}
+
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+ *dst = *src;
+ if (src->path)
+ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+ if (src->update_path)
+ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = NULL;
+}
+
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+ unsigned new_top = trans->mem_top + size;
+ size_t old_bytes = trans->mem_bytes;
+ size_t new_bytes = roundup_pow_of_two(new_top);
+ int ret;
+ void *new_mem;
+ void *p;
+
+ trans->mem_max = max(trans->mem_max, new_top);
+
+ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+
+ new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_bytes = BTREE_TRANS_MEM_MAX;
+ kfree(trans->mem);
+ }
+
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+
+ if (old_bytes) {
+ trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
+ }
+
+ p = trans->mem + trans->mem_top;
+ trans->mem_top += size;
+ memset(p, 0, size);
+ return p;
+}
+
+static inline void check_srcu_held_too_long(struct btree_trans *trans)
+{
+ WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+ "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+ (jiffies - trans->srcu_lock_time) / HZ);
+}
+
+void bch2_trans_srcu_unlock(struct btree_trans *trans)
+{
+ if (trans->srcu_held) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->cached && !btree_node_locked(path, 0))
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ trans->srcu_held = false;
+ }
+}
+
+void bch2_trans_srcu_lock(struct btree_trans *trans)
+{
+ if (!trans->srcu_held) {
+ trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+ }
+}
+
+/**
+ * bch2_trans_begin() - reset a transaction after an interrupted attempt
+ * @trans: transaction to reset
+ *
+ * Returns: current restart counter, to be used with trans_was_restarted()
+ *
+ * While iterating over nodes or updating nodes an attempt to lock a btree node
+ * may return BCH_ERR_transaction_restart when the trylock fails. When this
+ * occurs, bch2_trans_begin() should be called and the transaction retried.
+ */
+u32 bch2_trans_begin(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ u64 now;
+
+ bch2_trans_reset_updates(trans);
+
+ trans->restart_count++;
+ trans->mem_top = 0;
+
+ trans_for_each_path(trans, path) {
+ path->should_be_locked = false;
+
+ /*
+ * If the transaction wasn't restarted, we presume we're doing
+ * something new: don't keep iterators except the ones that are in
+ * use - except for the subvolumes btree:
+ */
+ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+ path->preserve = false;
+
+ /*
+ * XXX: we probably shouldn't be doing this if the transaction
+ * was restarted, but currently we still overflow transaction
+ * iterators if we do that
+ */
+ if (!path->ref && !path->preserve)
+ __bch2_path_free(trans, path);
+ else
+ path->preserve = false;
+ }
+
+ now = local_clock();
+ if (!trans->restarted &&
+ (need_resched() ||
+ now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+ drop_locks_do(trans, (cond_resched(), 0));
+ now = local_clock();
+ }
+ trans->last_begin_time = now;
+
+ if (unlikely(trans->srcu_held &&
+ time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+ bch2_trans_srcu_unlock(trans);
+
+ trans->last_begin_ip = _RET_IP_;
+ if (trans->restarted) {
+ bch2_btree_path_traverse_all(trans);
+ trans->notrace_relock_fail = false;
+ }
+
+ return trans->restart_count;
+}
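+
+/*
+ * Example (illustrative sketch): the retry pattern described above, open
+ * coded. example_work() is a hypothetical transactional operation; the
+ * lockrestart_do() macro in btree_iter.h wraps exactly this loop.
+ */
+int example_work(struct btree_trans *); /* hypothetical */
+
+static int __maybe_unused example_retry(struct btree_trans *trans)
+{
+ int ret;
+
+ do {
+ bch2_trans_begin(trans);
+ ret = example_work(trans); /* hypothetical transactional work */
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ return ret;
+}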
+
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+
+ if (IS_ENABLED(__KERNEL__)) {
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+ if (trans)
+ return trans;
+ }
+
+ trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+ /*
+ * paths need to be zeroed because bch2_check_for_deadlock looks
+ * at paths in other threads
+ */
+ memset(&trans->paths, 0, sizeof(trans->paths));
+ return trans;
+}
+
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+
+unsigned bch2_trans_get_fn_idx(const char *fn)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+ if (!bch2_btree_transaction_fns[i] ||
+ bch2_btree_transaction_fns[i] == fn) {
+ bch2_btree_transaction_fns[i] = fn;
+ return i;
+ }
+
+ pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
+ return i;
+}
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
+ __acquires(&c->btree_trans_barrier)
+{
+ struct btree_trans *trans;
+ struct btree_transaction_stats *s;
+
+ trans = bch2_trans_alloc(c);
+
+ memset(trans, 0, sizeof(*trans));
+ trans->c = c;
+ trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
+ ? bch2_btree_transaction_fns[fn_idx] : NULL;
+ trans->last_begin_time = local_clock();
+ trans->fn_idx = fn_idx;
+ trans->locking_wait.task = current;
+ trans->journal_replay_not_finished =
+ unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ atomic_inc_not_zero(&c->journal_keys.ref);
+ closure_init_stack(&trans->ref);
+
+ s = btree_trans_stats(trans);
+ if (s && s->max_mem) {
+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+
+ if (unlikely(!trans->mem)) {
+ trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
+ trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+ } else {
+ trans->mem_bytes = expected_mem_bytes;
+ }
+ }
+
+ if (s) {
+ trans->nr_max_paths = s->nr_max_paths;
+ trans->wb_updates_size = s->wb_updates_size;
+ }
+
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ struct btree_trans *pos;
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(pos, &c->btree_trans_list, list) {
+ /*
+ * We'd much prefer to be stricter here and completely
+ * disallow multiple btree_trans in the same thread -
+ * but the data move path calls bch2_write when we
+ * already have a btree_trans initialized.
+ */
+ BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
+ bch2_trans_locked(pos));
+
+ if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+ list_add_tail(&trans->list, &pos->list);
+ goto list_add_done;
+ }
+ }
+ list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
+ seqmutex_unlock(&c->btree_trans_lock);
+ }
+
+ return trans;
+}
+
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->ref)
+ goto leaked;
+ return;
+leaked:
+ bch_err(c, "btree paths leaked from %s!", trans->fn);
+ trans_for_each_path(trans, path)
+ if (path->ref)
+ printk(KERN_ERR " btree %s %pS\n",
+ bch2_btree_id_str(path->btree_id),
+ (void *) path->ip_allocated);
+ /* Be noisy about this: */
+ bch2_fatal_error(c);
+#endif
+}
+
+void bch2_trans_put(struct btree_trans *trans)
+ __releases(&c->btree_trans_barrier)
+{
+ struct btree_insert_entry *i;
+ struct bch_fs *c = trans->c;
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ bch2_trans_unlock(trans);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+ }
+
+ closure_sync(&trans->ref);
+
+ if (s)
+ s->max_mem = max(s->max_mem, trans->mem_max);
+
+ trans_for_each_update(trans, i)
+ __btree_path_put(i->path, true);
+ trans->nr_updates = 0;
+
+ check_btree_paths_leaked(trans);
+
+ if (trans->srcu_held) {
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ }
+
+ kfree(trans->extra_journal_entries.data);
+
+ if (trans->fs_usage_deltas) {
+ if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
+ REPLICAS_DELTA_LIST_MAX)
+ mempool_free(trans->fs_usage_deltas,
+ &c->replicas_delta_pool);
+ else
+ kfree(trans->fs_usage_deltas);
+ }
+
+ if (unlikely(trans->journal_replay_not_finished))
+ bch2_journal_keys_put(c);
+
+ if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ else
+ kfree(trans->mem);
+
+ /* Userspace doesn't have a real percpu implementation: */
+ if (IS_ENABLED(__KERNEL__))
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+ if (trans)
+ mempool_free(trans, &c->btree_trans_pool);
+}
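+
+/*
+ * Example (illustrative sketch): pairing bch2_trans_get() with
+ * bch2_trans_put(), with the restartable work in between run under
+ * lockrestart_do(). example_work() is the same hypothetical helper as above.
+ */
+static int __maybe_unused example_trans_lifecycle(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = lockrestart_do(trans, example_work(trans));
+
+ bch2_trans_put(trans);
+ return ret;
+}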
+
+static void __maybe_unused
+bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
+ struct btree_bkey_cached_common *b)
+{
+ struct six_lock_count c = six_lock_counts(&b->lock);
+ struct task_struct *owner;
+ pid_t pid;
+
+ rcu_read_lock();
+ owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ rcu_read_unlock();
+
+ prt_tab(out);
+ prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+ b->level, bch2_btree_id_str(b->btree_id));
+ bch2_bpos_to_text(out, btree_node_pos(b));
+
+ prt_tab(out);
+ prt_printf(out, " locks %u:%u:%u held by pid %u",
+ c.n[0], c.n[1], c.n[2], pid);
+}
+
+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+ struct btree_path *path;
+ struct btree_bkey_cached_common *b;
+ static char lock_types[] = { 'r', 'i', 'w' };
+ unsigned l, idx;
+
+ if (!out->nr_tabstops) {
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 32);
+ }
+
+ prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
+
+ trans_for_each_path_safe(trans, path, idx) {
+ if (!path->nodes_locked)
+ continue;
+
+ prt_printf(out, " path %u %c l=%u %s:",
+ path->idx,
+ path->cached ? 'c' : 'b',
+ path->level,
+ bch2_btree_id_str(path->btree_id));
+ bch2_bpos_to_text(out, path->pos);
+ prt_newline(out);
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ if (btree_node_locked(path, l) &&
+ !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
+ prt_printf(out, " %c l=%u ",
+ lock_types[btree_node_locked_type(path, l)], l);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
+ }
+ }
+ }
+
+ b = READ_ONCE(trans->locking);
+ if (b) {
+ prt_printf(out, " blocked for %lluus on",
+ div_u64(local_clock() - trans->locking_wait.start_time,
+ 1000));
+ prt_newline(out);
+ prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
+ }
+}
+
+void bch2_fs_btree_iter_exit(struct bch_fs *c)
+{
+ struct btree_transaction_stats *s;
+ struct btree_trans *trans;
+ int cpu;
+
+ trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+ if (trans)
+ panic("%s leaked btree_trans\n", trans->fn);
+
+ if (c->btree_trans_bufs)
+ for_each_possible_cpu(cpu)
+ kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+ free_percpu(c->btree_trans_bufs);
+
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++) {
+ kfree(s->max_paths_text);
+ bch2_time_stats_exit(&s->lock_hold_times);
+ }
+
+ if (c->btree_trans_barrier_initialized)
+ cleanup_srcu_struct(&c->btree_trans_barrier);
+ mempool_exit(&c->btree_trans_mem_pool);
+ mempool_exit(&c->btree_trans_pool);
+}
+
+void bch2_fs_btree_iter_init_early(struct bch_fs *c)
+{
+ struct btree_transaction_stats *s;
+
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++) {
+ bch2_time_stats_init(&s->lock_hold_times);
+ mutex_init(&s->lock);
+ }
+
+ INIT_LIST_HEAD(&c->btree_trans_list);
+ seqmutex_init(&c->btree_trans_lock);
+}
+
+int bch2_fs_btree_iter_init(struct bch_fs *c)
+{
+ int ret;
+
+ c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+ if (!c->btree_trans_bufs)
+ return -ENOMEM;
+
+ ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+ sizeof(struct btree_trans)) ?:
+ mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
+ BTREE_TRANS_MEM_MAX) ?:
+ init_srcu_struct(&c->btree_trans_barrier);
+ if (!ret)
+ c->btree_trans_barrier_initialized = true;
+ return ret;
+}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
new file mode 100644
index 000000000000..eaffced4c132
--- /dev/null
+++ b/fs/bcachefs/btree_iter.h
@@ -0,0 +1,944 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_ITER_H
+#define _BCACHEFS_BTREE_ITER_H
+
+#include "bset.h"
+#include "btree_types.h"
+#include "trace.h"
+
+static inline int __bkey_err(const struct bkey *k)
+{
+ return PTR_ERR_OR_ZERO(k);
+}
+
+#define bkey_err(_k) __bkey_err((_k).k)
+
+static inline void __btree_path_get(struct btree_path *path, bool intent)
+{
+ path->ref++;
+ path->intent_ref += intent;
+}
+
+static inline bool __btree_path_put(struct btree_path *path, bool intent)
+{
+ EBUG_ON(!path->ref);
+ EBUG_ON(!path->intent_ref && intent);
+ path->intent_ref -= intent;
+ return --path->ref == 0;
+}
+
+static inline void btree_path_set_dirty(struct btree_path *path,
+ enum btree_path_uptodate u)
+{
+ path->uptodate = max_t(unsigned, path->uptodate, u);
+}
+
+static inline struct btree *btree_path_node(struct btree_path *path,
+ unsigned level)
+{
+ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
+}
+
+static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
+ const struct btree *b, unsigned level)
+{
+ return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
+}
+
+static inline struct btree *btree_node_parent(struct btree_path *path,
+ struct btree *b)
+{
+ return btree_path_node(path, b->c.level + 1);
+}
+
+/* Iterate over paths within a transaction: */
+
+void __bch2_btree_trans_sort_paths(struct btree_trans *);
+
+static inline void btree_trans_sort_paths(struct btree_trans *trans)
+{
+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ trans->paths_sorted)
+ return;
+ __bch2_btree_trans_sort_paths(trans);
+}
+
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned idx)
+{
+ u64 l;
+
+ if (idx == BTREE_ITER_MAX)
+ return NULL;
+
+ l = trans->paths_allocated >> idx;
+ if (!l)
+ return NULL;
+
+ idx += __ffs64(l);
+ EBUG_ON(idx >= BTREE_ITER_MAX);
+ EBUG_ON(trans->paths[idx].idx != idx);
+ return &trans->paths[idx];
+}
+
+#define trans_for_each_path_from(_trans, _path, _start) \
+ for (_path = __trans_next_path((_trans), _start); \
+ (_path); \
+ _path = __trans_next_path((_trans), (_path)->idx + 1))
+
+#define trans_for_each_path(_trans, _path) \
+ trans_for_each_path_from(_trans, _path, 0)
+
+static inline struct btree_path *
+__trans_next_path_safe(struct btree_trans *trans, unsigned *idx)
+{
+ u64 l;
+
+ if (*idx == BTREE_ITER_MAX)
+ return NULL;
+
+ l = trans->paths_allocated >> *idx;
+ if (!l)
+ return NULL;
+
+ *idx += __ffs64(l);
+ EBUG_ON(*idx >= BTREE_ITER_MAX);
+ return &trans->paths[*idx];
+}
+
+/*
+ * This version is intended to be safe for use on a btree_trans that is owned by
+ * another thread, for bch2_btree_trans_to_text().
+ */
+#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \
+ for (_idx = _start; \
+ (_path = __trans_next_path_safe((_trans), &_idx)); \
+ _idx++)
+
+#define trans_for_each_path_safe(_trans, _path, _idx) \
+ trans_for_each_path_safe_from(_trans, _path, _idx, 0)
+
+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned idx = path ? path->sorted_idx + 1 : 0;
+
+ EBUG_ON(idx > trans->nr_sorted);
+
+ return idx < trans->nr_sorted
+ ? trans->paths + trans->sorted[idx]
+ : NULL;
+}
+
+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
+
+ return idx
+ ? trans->paths + trans->sorted[idx - 1]
+ : NULL;
+}
+
+#define trans_for_each_path_inorder(_trans, _path, _i) \
+ for (_i = 0; \
+ ((_path) = (_trans)->paths + (_trans)->sorted[_i]), (_i) < (_trans)->nr_sorted;\
+ _i++)
+
+#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
+ for (_i = (_trans)->nr_sorted - 1; \
+ ((_path) = (_trans)->paths + (_trans)->sorted[_i]), (_i) >= 0;\
+ --_i)
+
+static inline bool __path_has_node(const struct btree_path *path,
+ const struct btree *b)
+{
+ return path->l[b->c.level].b == b &&
+ btree_node_lock_seq_matches(path, b, b->c.level);
+}
+
+static inline struct btree_path *
+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
+ unsigned idx)
+{
+ struct btree_path *path = __trans_next_path(trans, idx);
+
+ while (path && !__path_has_node(path, b))
+ path = __trans_next_path(trans, path->idx + 1);
+
+ return path;
+}
+
+#define trans_for_each_path_with_node(_trans, _b, _path) \
+ for (_path = __trans_next_path_with_node((_trans), (_b), 0); \
+ (_path); \
+ _path = __trans_next_path_with_node((_trans), (_b), \
+ (_path)->idx + 1))
+
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+ bool, unsigned long);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+ struct btree_path *path, bool intent,
+ unsigned long ip)
+{
+ if (path->ref > 1 || path->preserve)
+ path = __bch2_btree_path_make_mut(trans, path, intent, ip);
+ path->should_be_locked = false;
+ return path;
+}
+
+struct btree_path * __must_check
+__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+ struct bpos, bool, unsigned long, int);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
+ struct btree_path *path, struct bpos new_pos,
+ bool intent, unsigned long ip)
+{
+ int cmp = bpos_cmp(new_pos, path->pos);
+
+ return cmp
+ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp)
+ : path;
+}
+
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+ unsigned, unsigned long);
+
+static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+ struct btree_path *path, unsigned flags)
+{
+ if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+ return 0;
+
+ return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
+}
+
+int __must_check bch2_btree_path_traverse(struct btree_trans *,
+ struct btree_path *, unsigned);
+struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned, unsigned long);
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
+
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
+ */
+static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+ if (k.k && bpos_eq(path->pos, k.k->p))
+ return k;
+
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
+ struct btree_iter *, struct bpos);
+
+void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
+
+int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
+
+static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
+{
+ return mutex_trylock(lock)
+ ? 0
+ : __bch2_trans_mutex_lock(trans, lock);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_trans_verify_paths(struct btree_trans *);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
+ struct bpos, bool);
+#else
+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos, bool key_cache) {}
+#endif
+
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+ struct btree *, struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, unsigned, unsigned);
+
+int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+
+void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
+
+int bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock_notrace(struct btree_trans *);
+void bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_unlock_long(struct btree_trans *);
+bool bch2_trans_locked(struct btree_trans *);
+
+static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
+{
+ return restart_count != trans->restart_count
+ ? -BCH_ERR_transaction_restart_nested
+ : 0;
+}
+
+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
+
+static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
+ u32 restart_count)
+{
+ if (trans_was_restarted(trans, restart_count))
+ bch2_trans_restart_error(trans, restart_count);
+}
+
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
+
+static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
+{
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+}
+
+__always_inline
+static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
+{
+ BUG_ON(err <= 0);
+ BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
+
+ trans->restarted = err;
+ trans->last_restarted_ip = _THIS_IP_;
+ return -err;
+}
+
+__always_inline
+static int btree_trans_restart(struct btree_trans *trans, int err)
+{
+ btree_trans_restart_nounlock(trans, err);
+ return -err;
+}
+
+bool bch2_btree_node_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
+
+void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
+
+static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned new_locks_want = path->level + !!path->intent_ref;
+
+ if (path->locks_want > new_locks_want)
+ __bch2_btree_path_downgrade(trans, path, new_locks_want);
+}
+
+void bch2_trans_downgrade(struct btree_trans *);
+
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
+
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
+int __must_check bch2_btree_iter_traverse(struct btree_iter *);
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
+
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+}
+
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
+
+bool bch2_btree_iter_advance(struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_iter *);
+
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ iter->k.type = KEY_TYPE_deleted;
+ iter->k.p.inode = iter->pos.inode = new_pos.inode;
+ iter->k.p.offset = iter->pos.offset = new_pos.offset;
+ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
+ iter->k.size = 0;
+}
+
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ if (unlikely(iter->update_path))
+ bch2_path_put(iter->trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ new_pos.snapshot = iter->snapshot;
+
+ __bch2_btree_iter_set_pos(iter, new_pos);
+}
+
+static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
+{
+ BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
+ iter->pos = bkey_start_pos(&iter->k);
+}
+
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+{
+ struct bpos pos = iter->pos;
+
+ iter->snapshot = snapshot;
+ pos.snapshot = snapshot;
+ bch2_btree_iter_set_pos(iter, pos);
+}
+
+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
+
+static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags)
+{
+ if (flags & BTREE_ITER_ALL_LEVELS)
+ flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+ btree_id_is_extents(btree_id))
+ flags |= BTREE_ITER_IS_EXTENTS;
+
+ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshot_field(btree_id))
+ flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ btree_type_has_snapshots(btree_id))
+ flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+
+ if (trans->journal_replay_not_finished)
+ flags |= BTREE_ITER_WITH_JOURNAL;
+
+ return flags;
+}
+
+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags)
+{
+ if (!btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_CACHED;
+ flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ } else if (!(flags & BTREE_ITER_CACHED))
+ flags |= BTREE_ITER_WITH_KEY_CACHE;
+
+ return __bch2_btree_iter_flags(trans, btree_id, flags);
+}
+
+static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags,
+ unsigned long ip)
+{
+ memset(iter, 0, sizeof(*iter));
+ iter->trans = trans;
+ iter->btree_id = btree_id;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k.p = pos;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ iter->ip_allocated = ip;
+#endif
+ iter->path = bch2_path_get(trans, btree_id, iter->pos,
+ locks_want, depth, flags, ip);
+}
+
+void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos, unsigned);
+
+static inline void bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ if (__builtin_constant_p(btree_id) &&
+ __builtin_constant_p(flags))
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _THIS_IP_);
+ else
+ bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
+}
+
+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned);
+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
+
+static inline void set_btree_iter_dontneed(struct btree_iter *iter)
+{
+ if (!iter->trans->restarted)
+ iter->path->preserve = false;
+}
+
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+
+static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+ size = roundup(size, 8);
+
+ if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+ void *p = trans->mem + trans->mem_top;
+
+ trans->mem_top += size;
+ memset(p, 0, size);
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
+}
+
+static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
+{
+ size = roundup(size, 8);
+
+ if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+ void *p = trans->mem + trans->mem_top;
+
+ trans->mem_top += size;
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
+}
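+
+/*
+ * Example (illustrative sketch): allocating a key from the transaction's
+ * bump allocator. The memory is reclaimed when the transaction is reset by
+ * bch2_trans_begin() or freed by bch2_trans_put(), so no explicit free is
+ * needed. example_alloc_key() is a hypothetical name.
+ */
+static inline struct bkey_i *example_alloc_key(struct btree_trans *trans)
+{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+
+ if (!IS_ERR(k))
+ bkey_init(&k->k);
+ return k;
+}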
+
+static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type)
+{
+ struct bkey_s_c k;
+
+ bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
+ k = bch2_btree_iter_peek_slot(iter);
+
+ if (!bkey_err(k) && type && k.k->type != type)
+ k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
+ if (unlikely(bkey_err(k)))
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+}
+
+static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
+}
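+
+/*
+ * Example (illustrative sketch): the intended calling convention - on error
+ * __bch2_bkey_get_iter() has already exited the iterator, so the caller only
+ * exits it on the success path. example_get_key() is a hypothetical name.
+ */
+static inline int example_get_key(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos, 0);
+ int ret = bkey_err(k);
+
+ if (ret)
+ return ret;
+
+ /* use k here, then: */
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}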
+
+#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
+ bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
+ _btree_id, _pos, _flags, KEY_TYPE_##_type))
+
+static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type,
+ unsigned val_size, void *val)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
+ ret = bkey_err(k);
+ if (!ret) {
+ unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size);
+
+ memcpy(val, k.v, b);
+ if (unlikely(b < sizeof(*val)))
+ memset((void *) val + b, 0, sizeof(*val) - b);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ return ret;
+}
+
+#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
+ __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
+ KEY_TYPE_##_type, sizeof(*_val), _val)
+
+void bch2_trans_srcu_unlock(struct btree_trans *);
+void bch2_trans_srcu_lock(struct btree_trans *);
+
+u32 bch2_trans_begin(struct btree_trans *);
+
+/*
+ * XXX
+ * this does not handle transaction restarts from bch2_btree_iter_next_node()
+ * correctly
+ */
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _locks_want, _depth, _flags, _b, _ret) \
+ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
+ _start, _locks_want, _depth, _flags); \
+ (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \
+ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \
+ (_b) = bch2_btree_iter_next_node(&(_iter)))
+
+#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _flags, _b, _ret) \
+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ 0, 0, _flags, _b, _ret)
+
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+ unsigned flags)
+{
+ BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
+
+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek_prev(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
+ unsigned flags)
+{
+ return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
+ flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
+{
+ if (!(flags & BTREE_ITER_SLOTS))
+ return bch2_btree_iter_peek_upto(iter, end);
+
+ if (bkey_gt(iter->pos, end))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
+}
+
+static inline int btree_trans_too_many_iters(struct btree_trans *trans)
+{
+ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
+ trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
+ }
+
+ return 0;
+}
+
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
+{
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(iter, flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(trans);
+
+ return k;
+}
+
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
+{
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_upto_type(iter, end, flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(trans);
+
+ return k;
+}
+
+#define lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count; \
+ int _ret2; \
+ \
+ do { \
+ _restart_count = bch2_trans_begin(_trans); \
+ _ret2 = (_do); \
+ } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \
+ \
+ if (!_ret2) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ _ret2; \
+})
+
+/*
+ * nested_lockrestart_do(), nested_commit_do():
+ *
+ * These are like lockrestart_do() and commit_do(), with two differences:
+ *
+ * - We don't call bch2_trans_begin() unless we had a transaction restart
+ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
+ * transaction restart
+ */
+#define nested_lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count, _orig_restart_count; \
+ int _ret2; \
+ \
+ _restart_count = _orig_restart_count = (_trans)->restart_count; \
+ \
+ while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
+ _restart_count = bch2_trans_begin(_trans); \
+ \
+ if (!_ret2) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ _ret2 ?: trans_was_restarted(_trans, _restart_count); \
+})
+
+#define for_each_btree_key2(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ int _ret3 = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ \
+ _ret3 = 0; \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret3) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_advance(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
+({ \
+ int _ret3 = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ \
+ _ret3 = 0; \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
+ if (!(_k).k) \
+ break; \
+ \
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret3) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_advance(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ int _ret3 = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
+ if (!(_k).k) { \
+ _ret3 = 0; \
+ break; \
+ } \
+ \
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret3) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_rewind(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
+ _start, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \
+ _start, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \
+ _start, _end, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \
+ &(_iter), _end, _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
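+
+/*
+ * Example (illustrative sketch): count the keys in [start, end] with
+ * for_each_btree_key_upto(). The macro initializes the iterator and retries
+ * on transaction restart; the caller still exits the iterator.
+ * example_count_keys() is a hypothetical name.
+ */
+static inline int example_count_keys(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos start, struct bpos end,
+ u64 *nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ *nr = 0;
+ for_each_btree_key_upto(trans, iter, btree, start, end, 0, k, ret)
+ (*nr)++;
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}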
+
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
+ for (; \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+ for (; \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
+ for (; \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define drop_locks_do(_trans, _do) \
+({ \
+ bch2_trans_unlock(_trans); \
+ _do ?: bch2_trans_relock(_trans); \
+})
+
+#define allocate_dropping_locks_errcode(_trans, _do) \
+({ \
+ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ int _ret = _do; \
+ \
+ if (bch2_err_matches(_ret, ENOMEM)) { \
+ _gfp = GFP_KERNEL; \
+ _ret = drop_locks_do(_trans, _do); \
+ } \
+ _ret; \
+})
+
+#define allocate_dropping_locks(_trans, _ret, _do) \
+({ \
+ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ typeof(_do) _p = _do; \
+ \
+ _ret = 0; \
+ if (unlikely(!_p)) { \
+ _gfp = GFP_KERNEL; \
+ _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
+ } \
+ _p; \
+})
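+
+/*
+ * Example (illustrative sketch): a GFP_NOWAIT allocation that falls back to
+ * dropping btree locks and retrying with GFP_KERNEL, as encoded by
+ * allocate_dropping_locks() above. The expression passed in must use the
+ * macro's _gfp variable. example_alloc_buf() is a hypothetical name.
+ */
+static inline void *example_alloc_buf(struct btree_trans *trans, size_t size,
+ int *ret)
+{
+ return allocate_dropping_locks(trans, *ret, kmalloc(size, _gfp));
+}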
+
+/* new multiple iterator interface: */
+
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
+void bch2_trans_put(struct btree_trans *);
+
+extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+unsigned bch2_trans_get_fn_idx(const char *);
+
+#define bch2_trans_get(_c) \
+({ \
+ static unsigned trans_fn_idx; \
+ \
+ if (unlikely(!trans_fn_idx)) \
+ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
+ __bch2_trans_get(_c, trans_fn_idx); \
+})
+
+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
+
+void bch2_fs_btree_iter_exit(struct bch_fs *);
+void bch2_fs_btree_iter_init_early(struct bch_fs *);
+int bch2_fs_btree_iter_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_ITER_H */
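For context, a hypothetical user of the interface declared above; the btree id, flags and loop body are placeholders, and transaction restarts are absorbed by the *_and_restart peek helpers the macro expands to.

static int example_walk(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
				POS_MIN, SPOS_MAX, 0, k, ret) {
		/* inspect k */
	}
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	return ret;
}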
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
new file mode 100644
index 000000000000..ec52f50d249d
--- /dev/null
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bset.h"
+#include "btree_journal_iter.h"
+#include "journal_io.h"
+
+#include <linux/sort.h>
+
+/*
+ * For managing keys we read from the journal: until journal replay has
+ * finished, normal btree lookups need to be able to find and return keys from
+ * the journal where they overwrite what's in the btree, so we have a special
+ * iterator and operations for the regular btree iter code to use:
+ */
+
+static int __journal_key_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ struct bpos l_pos,
+ const struct journal_key *r)
+{
+ return (cmp_int(l_btree_id, r->btree_id) ?:
+ cmp_int(l_level, r->level) ?:
+ bpos_cmp(l_pos, r->k->k.p));
+}
+
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+{
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+}
+
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
+{
+ size_t gap_size = keys->size - keys->nr;
+
+ if (idx >= keys->gap)
+ idx += gap_size;
+ return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+ return keys->d + idx_to_pos(keys, idx);
+}
+
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ size_t l = 0, r = keys->nr, m;
+
+ while (l < r) {
+ m = l + ((r - l) >> 1);
+ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ BUG_ON(l < keys->nr &&
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
+
+ BUG_ON(l &&
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
+
+ return l;
+}
+
+static size_t bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
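The journal keys live in a gap buffer: keys->d has keys->size slots holding keys->nr sorted keys, and the unused slots form a single hole starting at logical index keys->gap, so the binary search above runs over logical indices and idx_to_pos() skips the hole. A standalone sketch of that mapping (illustrative, not the kernel helper):

static size_t example_idx_to_pos(size_t idx, size_t gap, size_t nr, size_t size)
{
	size_t gap_size = size - nr;	/* number of unused slots */

	/* e.g. size 8, nr 6, gap 2: logical 0..5 map to slots 0, 1, 4, 5, 6, 7 */
	return idx >= gap ? idx + gap_size : idx;
}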
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ unsigned iters = 0;
+ struct journal_key *k;
+
+ BUG_ON(*idx > keys->nr);
+search:
+ if (!*idx)
+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+ return NULL;
+
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
+ !k->overwritten)
+ return k->k;
+
+ (*idx)++;
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ return NULL;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
+{
+ size_t idx = 0;
+
+ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ /* The key we just inserted is immediately before the gap: */
+ size_t gap_end = keys->gap + (keys->size - keys->nr);
+ struct btree_and_journal_iter *iter;
+
+ /*
+ * If an iterator points one after the key we just inserted, decrement
+ * the iterator so it points at the key we just inserted - if the
+ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
+ * handle that:
+ */
+ list_for_each_entry(iter, &c->journal_iters, journal.list)
+ if (iter->journal.idx == gap_end)
+ iter->journal.idx = keys->gap - 1;
+}
+
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_iter *iter;
+ size_t gap_size = keys->size - keys->nr;
+
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ if (iter->idx > old_gap)
+ iter->idx -= gap_size;
+ if (iter->idx >= new_gap)
+ iter->idx += gap_size;
+ }
+}
+
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct journal_key n = {
+ .btree_id = id,
+ .level = level,
+ .k = k,
+ .allocated = true,
+ /*
+ * Ensure these keys are done last by journal replay, to unblock
+ * journal reclaim:
+ */
+ .journal_seq = U32_MAX,
+ };
+ struct journal_keys *keys = &c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+ BUG_ON(test_bit(BCH_FS_RW, &c->flags));
+
+ if (idx < keys->size &&
+ journal_key_cmp(&n, &keys->d[idx]) == 0) {
+ if (keys->d[idx].allocated)
+ kfree(keys->d[idx].k);
+ keys->d[idx] = n;
+ return 0;
+ }
+
+ if (idx > keys->gap)
+ idx -= keys->size - keys->nr;
+
+ if (keys->nr == keys->size) {
+ struct journal_keys new_keys = {
+ .nr = keys->nr,
+ .size = max_t(size_t, keys->size, 8) * 2,
+ };
+
+ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
+ if (!new_keys.d) {
+ bch_err(c, "%s: error allocating new key array (size %zu)",
+ __func__, new_keys.size);
+ return -BCH_ERR_ENOMEM_journal_key_insert;
+ }
+
+ /* Since @keys was full, there was no gap: */
+ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+ kvfree(keys->d);
+ keys->d = new_keys.d;
+ keys->nr = new_keys.nr;
+ keys->size = new_keys.size;
+
+ /* And now the gap is at the end: */
+ keys->gap = keys->nr;
+ }
+
+ journal_iters_move_gap(c, keys->gap, idx);
+
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+ keys->gap = idx;
+
+ keys->nr++;
+ keys->d[keys->gap++] = n;
+
+ journal_iters_fix(c);
+
+ return 0;
+}
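The insert path relies on move_gap() (defined elsewhere in bcachefs) to bring the hole to the insertion point before the new key is written; a conceptual sketch, assuming move_gap() only memmoves the elements between the old and new gap positions:

/*
 *   before:  [A B C . . D E]   two free slots starting at index 3; insert X at 2
 *   move:    [A B . . C D E]   move_gap() shifts C right, hole now starts at 2
 *   write:   [A B X . C D E]   new key written at keys->gap, which advances by one
 */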
+
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct bkey_i *n;
+ int ret;
+
+ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+ if (!n)
+ return -BCH_ERR_ENOMEM_journal_key_insert;
+
+ bkey_copy(n, k);
+ ret = bch2_journal_key_insert_take(c, id, level, n);
+ if (ret)
+ kfree(n);
+ return ret;
+}
+
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bpos pos)
+{
+ struct bkey_i whiteout;
+
+ bkey_init(&whiteout.k);
+ whiteout.k.p = pos;
+
+ return bch2_journal_key_insert(c, id, level, &whiteout);
+}
+
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (idx < keys->size &&
+ keys->d[idx].btree_id == btree &&
+ keys->d[idx].level == level &&
+ bpos_eq(keys->d[idx].k->k.p, pos))
+ keys->d[idx].overwritten = true;
+}
+
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+ if (iter->idx < iter->keys->size) {
+ iter->idx++;
+ if (iter->idx == iter->keys->gap)
+ iter->idx += iter->keys->size - iter->keys->nr;
+ }
+}
+
+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+ struct journal_key *k = iter->keys->d + iter->idx;
+
+ while (k < iter->keys->d + iter->keys->size &&
+ k->btree_id == iter->btree_id &&
+ k->level == iter->level) {
+ if (!k->overwritten)
+ return bkey_i_to_s_c(k->k);
+
+ bch2_journal_iter_advance(iter);
+ k = iter->keys->d + iter->idx;
+ }
+
+ return bkey_s_c_null;
+}
+
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+ list_del(&iter->list);
+}
+
+static void bch2_journal_iter_init(struct bch_fs *c,
+ struct journal_iter *iter,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ iter->btree_id = id;
+ iter->level = level;
+ iter->keys = &c->journal_keys;
+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+ iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+}
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
+{
+ if (bpos_eq(iter->pos, SPOS_MAX))
+ iter->at_end = true;
+ else
+ iter->pos = bpos_successor(iter->pos);
+}
+
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+{
+ struct bkey_s_c btree_k, journal_k, ret;
+again:
+ if (iter->at_end)
+ return bkey_s_c_null;
+
+ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+ bpos_lt(btree_k.k->p, iter->pos))
+ bch2_journal_iter_advance_btree(iter);
+
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
+
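+ /* journal keys win ties: at the same position, the journal key overrides the btree key */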
+ ret = journal_k.k &&
+ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
+ ? journal_k
+ : btree_k;
+
+ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
+ ret = bkey_s_c_null;
+
+ if (ret.k) {
+ iter->pos = ret.k->p;
+ if (bkey_deleted(ret.k)) {
+ bch2_btree_and_journal_iter_advance(iter);
+ goto again;
+ }
+ } else {
+ iter->pos = SPOS_MAX;
+ iter->at_end = true;
+ }
+
+ return ret;
+}
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
+{
+ bch2_journal_iter_exit(&iter->journal);
+}
+
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct bpos pos)
+{
+ memset(iter, 0, sizeof(*iter));
+
+ iter->b = b;
+ iter->node_iter = node_iter;
+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ INIT_LIST_HEAD(&iter->journal.list);
+ iter->pos = b->data->min_key;
+ iter->at_end = false;
+}
+
+/*
+ * This version is used by btree_gc before the filesystem has gone RW and
+ * multithreaded, so it uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b)
+{
+ struct btree_node_iter node_iter;
+
+ bch2_btree_node_iter_init_from_start(&node_iter, b);
+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+ list_add(&iter->journal.list, &c->journal_iters);
+}
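A hedged sketch of how the combined iterator is consumed during recovery, mirroring the btree_gc pattern; the processing body is a placeholder.

static void example_scan_node(struct bch_fs *c, struct btree *b)
{
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;

	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		/* process k: journal keys already override btree keys here */
		bch2_btree_and_journal_iter_advance(&iter);
	}

	bch2_btree_and_journal_iter_exit(&iter);
}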
+
+/* sort and dedup all keys in the journal: */
+
+void bch2_journal_entries_free(struct bch_fs *c)
+{
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
+ genradix_for_each(&c->journal_entries, iter, i)
+ if (*i)
+ kvpfree(*i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&(*i)->j));
+ genradix_free(&c->journal_entries);
+}
+
+/*
+ * When keys compare equal, oldest compares first:
+ */
+static int journal_sort_key_cmp(const void *_l, const void *_r)
+{
+ const struct journal_key *l = _l;
+ const struct journal_key *r = _r;
+
+ return journal_key_cmp(l, r) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset);
+}
+
+void bch2_journal_keys_put(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *i;
+
+ BUG_ON(atomic_read(&keys->ref) <= 0);
+
+ if (!atomic_dec_and_test(&keys->ref))
+ return;
+
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
+ for (i = keys->d; i < keys->d + keys->nr; i++)
+ if (i->allocated)
+ kfree(i->k);
+
+ kvfree(keys->d);
+ keys->d = NULL;
+ keys->nr = keys->gap = keys->size = 0;
+
+ bch2_journal_entries_free(c);
+}
+
+static void __journal_keys_sort(struct journal_keys *keys)
+{
+ struct journal_key *src, *dst;
+
+ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+
+ src = dst = keys->d;
+ while (src < keys->d + keys->nr) {
+ while (src + 1 < keys->d + keys->nr &&
+ src[0].btree_id == src[1].btree_id &&
+ src[0].level == src[1].level &&
+ bpos_eq(src[0].k->k.p, src[1].k->k.p))
+ src++;
+
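+ /* among duplicates, keep only the last, i.e. newest, version */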
+ *dst++ = *src++;
+ }
+
+ keys->nr = dst - keys->d;
+}
+
+int bch2_journal_keys_sort(struct bch_fs *c)
+{
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+ struct jset_entry *entry;
+ struct bkey_i *k;
+ struct journal_keys *keys = &c->journal_keys;
+ size_t nr_keys = 0, nr_read = 0;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ for_each_jset_key(k, entry, &i->j)
+ nr_keys++;
+ }
+
+ if (!nr_keys)
+ return 0;
+
+ keys->size = roundup_pow_of_two(nr_keys);
+
+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+ if (!keys->d) {
+ bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
+ nr_keys);
+
+ do {
+ keys->size >>= 1;
+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+ } while (!keys->d && keys->size > nr_keys / 8);
+
+ if (!keys->d) {
+ bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
+ keys->size);
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+ }
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ cond_resched();
+
+ for_each_jset_key(k, entry, &i->j) {
+ if (keys->nr == keys->size) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr > keys->size * 7 / 8) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
+ keys->nr, keys->size, nr_read, nr_keys);
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+ }
+
+ keys->d[keys->nr++] = (struct journal_key) {
+ .btree_id = entry->btree_id,
+ .level = entry->level,
+ .k = k,
+ .journal_seq = le64_to_cpu(i->j.seq),
+ .journal_offset = k->_data - i->j._data,
+ };
+
+ nr_read++;
+ }
+ }
+
+ __journal_keys_sort(keys);
+ keys->gap = keys->nr;
+
+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+ return 0;
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
new file mode 100644
index 000000000000..8ca4c100b2e3
--- /dev/null
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_H
+
+struct journal_iter {
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ size_t idx;
+ struct journal_keys *keys;
+};
+
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+
+struct btree_and_journal_iter {
+ struct btree *b;
+ struct btree_node_iter node_iter;
+ struct bkey unpacked;
+
+ struct journal_iter journal;
+ struct bpos pos;
+ bool at_end;
+};
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct bch_fs *, struct btree *,
+ struct btree_node_iter, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct bch_fs *,
+ struct btree *);
+
+void bch2_journal_keys_put(struct bch_fs *);
+
+static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
+{
+ if (c->journal_keys.initial_ref_held)
+ bch2_journal_keys_put(c);
+ c->journal_keys.initial_ref_held = false;
+}
+
+void bch2_journal_entries_free(struct bch_fs *);
+
+int bch2_journal_keys_sort(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
new file mode 100644
index 000000000000..1b7a5668df7c
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache.c
@@ -0,0 +1,1074 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+static inline bool btree_uses_pcpu_readers(enum btree_id id)
+{
+ return id == BTREE_ID_subvolumes;
+}
+
+static struct kmem_cache *bch2_key_cache;
+
+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct bkey_cached *ck = obj;
+ const struct bkey_cached_key *key = arg->key;
+
+ return ck->key.btree_id != key->btree_id ||
+ !bpos_eq(ck->key.pos, key->pos);
+}
+
+static const struct rhashtable_params bch2_btree_key_cache_params = {
+ .head_offset = offsetof(struct bkey_cached, hash),
+ .key_offset = offsetof(struct bkey_cached, key),
+ .key_len = sizeof(struct bkey_cached_key),
+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+};
+
+__flatten
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+{
+ struct bkey_cached_key key = {
+ .btree_id = btree_id,
+ .pos = pos,
+ };
+
+ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
+ bch2_btree_key_cache_params);
+}
+
+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
+{
+ if (!six_trylock_intent(&ck->c.lock))
+ return false;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ six_unlock_intent(&ck->c.lock);
+ return false;
+ }
+
+ if (!six_trylock_write(&ck->c.lock)) {
+ six_unlock_intent(&ck->c.lock);
+ return false;
+ }
+
+ return true;
+}
+
+static void bkey_cached_evict(struct btree_key_cache *c,
+ struct bkey_cached *ck)
+{
+ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
+ bch2_btree_key_cache_params));
+ memset(&ck->key, ~0, sizeof(ck->key));
+
+ atomic_long_dec(&c->nr_keys);
+}
+
+static void bkey_cached_free(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ if (ck->c.lock.readers) {
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ bc->nr_freed_pcpu++;
+ } else {
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
+ }
+ atomic_long_inc(&bc->nr_freed);
+
+ kfree(ck->k);
+ ck->k = NULL;
+ ck->u64s = 0;
+
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+}
+
+#ifdef __KERNEL__
+static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bkey_cached *pos;
+
+ bc->nr_freed_nonpcpu++;
+
+ list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
+ if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
+ pos->btree_trans_barrier_seq)) {
+ list_move(&ck->list, &pos->list);
+ return;
+ }
+ }
+
+ list_move(&ck->list, &bc->freed_nonpcpu);
+}
+#endif
+
+static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+ if (!ck->c.lock.readers) {
+#ifdef __KERNEL__
+ struct btree_key_cache_freelist *f;
+ bool freed = false;
+
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ if (f->nr < ARRAY_SIZE(f->objs)) {
+ f->objs[f->nr++] = ck;
+ freed = true;
+ }
+ preempt_enable();
+
+ if (!freed) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (f->nr > ARRAY_SIZE(f->objs) / 2) {
+ struct bkey_cached *ck2 = f->objs[--f->nr];
+
+ __bkey_cached_move_to_freelist_ordered(bc, ck2);
+ }
+ preempt_enable();
+
+ __bkey_cached_move_to_freelist_ordered(bc, ck);
+ mutex_unlock(&bc->lock);
+ }
+#else
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
+ mutex_unlock(&bc->lock);
+#endif
+ } else {
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ mutex_unlock(&bc->lock);
+ }
+}
+
+static void bkey_cached_free_fast(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ list_del_init(&ck->list);
+ atomic_long_inc(&bc->nr_freed);
+
+ kfree(ck->k);
+ ck->k = NULL;
+ ck->u64s = 0;
+
+ bkey_cached_move_to_freelist(bc, ck);
+
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+}
+
+static struct bkey_cached *
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
+ bool *was_new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck = NULL;
+ bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
+ int ret;
+
+ if (!pcpu_readers) {
+#ifdef __KERNEL__
+ struct btree_key_cache_freelist *f;
+
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+ if (f->nr)
+ ck = f->objs[--f->nr];
+ preempt_enable();
+
+ if (!ck) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (!list_empty(&bc->freed_nonpcpu) &&
+ f->nr < ARRAY_SIZE(f->objs) / 2) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
+ f->objs[f->nr++] = ck;
+ }
+
+ ck = f->nr ? f->objs[--f->nr] : NULL;
+ preempt_enable();
+ mutex_unlock(&bc->lock);
+ }
+#else
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_nonpcpu)) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
+ }
+ mutex_unlock(&bc->lock);
+#endif
+ } else {
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_pcpu)) {
+ ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ }
+ mutex_unlock(&bc->lock);
+ }
+
+ if (ck) {
+ ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
+ if (unlikely(ret)) {
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
+ path->l[0].b = (void *) ck;
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+
+ ret = bch2_btree_node_lock_write(trans, path, &ck->c);
+ if (unlikely(ret)) {
+ btree_node_unlock(trans, path, 0);
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
+ return ck;
+ }
+
+ ck = allocate_dropping_locks(trans, ret,
+ kmem_cache_zalloc(bch2_key_cache, _gfp));
+ if (ret) {
+ kmem_cache_free(bch2_key_cache, ck);
+ return ERR_PTR(ret);
+ }
+
+ if (!ck)
+ return NULL;
+
+ INIT_LIST_HEAD(&ck->list);
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+
+ ck->c.cached = true;
+ BUG_ON(!six_trylock_intent(&ck->c.lock));
+ BUG_ON(!six_trylock_write(&ck->c.lock));
+ *was_new = true;
+ return ck;
+}
+
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct bkey_cached *ck;
+ unsigned i;
+
+ mutex_lock(&c->lock);
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(c, ck);
+ goto out;
+ }
+ }
+ ck = NULL;
+out:
+ rcu_read_unlock();
+ mutex_unlock(&c->lock);
+ return ck;
+}
+
+static struct bkey_cached *
+btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck;
+ bool was_new = false;
+
+ ck = bkey_cached_alloc(trans, path, &was_new);
+ if (IS_ERR(ck))
+ return ck;
+
+ if (unlikely(!ck)) {
+ ck = bkey_cached_reuse(bc);
+ if (unlikely(!ck)) {
+ bch_err(c, "error allocating memory for key cache item, btree %s",
+ bch2_btree_id_str(path->btree_id));
+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
+ }
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+ }
+
+ ck->c.level = 0;
+ ck->c.btree_id = path->btree_id;
+ ck->key.btree_id = path->btree_id;
+ ck->key.pos = path->pos;
+ ck->valid = false;
+ ck->flags = 1U << BKEY_CACHED_ACCESSED;
+
+ if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
+ &ck->hash,
+ bch2_btree_key_cache_params))) {
+ /* We raced with another fill: */
+
+ if (likely(was_new)) {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ kfree(ck);
+ } else {
+ bkey_cached_free_fast(bc, ck);
+ }
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
+ return NULL;
+ }
+
+ atomic_long_inc(&bc->nr_keys);
+
+ six_unlock_write(&ck->c.lock);
+
+ return ck;
+}
+
+static int btree_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ struct bkey_cached *ck)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned new_u64s = 0;
+ struct bkey_i *new_k = NULL;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
+ BTREE_ITER_KEY_CACHE_FILL|
+ BTREE_ITER_CACHED_NOFILL);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
+ goto err;
+ }
+
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at
+ * most 7 bytes (the extra bytes read are never used):
+ */
+ new_u64s = k.k->u64s + 1;
+
+ /*
+ * Allocate some extra space so that the transaction commit path is less
+ * likely to have to reallocate, since that requires a transaction
+ * restart:
+ */
+ new_u64s = min(256U, (new_u64s * 3) / 2);
+
+ if (new_u64s > ck->u64s) {
+ new_u64s = roundup_pow_of_two(new_u64s);
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
+ if (!new_k) {
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(ck->key.btree_id), new_u64s);
+ ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ goto err;
+ }
+
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+ kfree(new_k);
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
+ goto err;
+ }
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_k);
+ goto err;
+ }
+ }
+ }
+
+ ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
+ if (ret) {
+ kfree(new_k);
+ goto err;
+ }
+
+ if (new_k) {
+ kfree(ck->k);
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ }
+
+ bkey_reassemble(ck->k, k);
+ ck->valid = true;
+ bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
+
+ /* We're not likely to need this iterator again: */
+ set_btree_iter_dontneed(&iter);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static noinline int
+bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck;
+ int ret = 0;
+
+ BUG_ON(path->level);
+
+ path->l[1].b = NULL;
+
+ if (bch2_btree_node_relock_notrace(trans, path, 0)) {
+ ck = (void *) path->l[0].b;
+ goto fill;
+ }
+retry:
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+ if (!ck) {
+ ck = btree_key_cache_create(trans, path);
+ ret = PTR_ERR_OR_ZERO(ck);
+ if (ret)
+ goto err;
+ if (!ck)
+ goto retry;
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+ path->locks_want = 1;
+ } else {
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+ ret = btree_node_lock(trans, path, (void *) ck, 0,
+ lock_want, _THIS_IP_);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto err;
+
+ BUG_ON(ret);
+
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
+ }
+
+ mark_btree_node_locked(trans, path, 0,
+ (enum btree_node_locked_type) lock_want);
+ }
+
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ path->l[0].b = (void *) ck;
+fill:
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+ /*
+ * Using the underscore version because we haven't set
+ * path->uptodate yet:
+ */
+ if (!path->locks_want &&
+ !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
+ trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
+ goto err;
+ }
+
+ ret = btree_key_cache_fill(trans, path, ck);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
+ if (ret)
+ goto err;
+
+ path->uptodate = BTREE_ITER_UPTODATE;
+ }
+
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+ BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+ BUG_ON(path->uptodate);
+
+ return ret;
+err:
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(ret);
+ }
+ return ret;
+}
+
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck;
+ int ret = 0;
+
+ EBUG_ON(path->level);
+
+ path->l[1].b = NULL;
+
+ if (bch2_btree_node_relock_notrace(trans, path, 0)) {
+ ck = (void *) path->l[0].b;
+ goto fill;
+ }
+retry:
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+ if (!ck) {
+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+ } else {
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+ ret = btree_node_lock(trans, path, (void *) ck, 0,
+ lock_want, _THIS_IP_);
+ EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (ret)
+ return ret;
+
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
+ }
+
+ mark_btree_node_locked(trans, path, 0,
+ (enum btree_node_locked_type) lock_want);
+ }
+
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ path->l[0].b = (void *) ck;
+fill:
+ if (!ck->valid)
+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+ path->uptodate = BTREE_ITER_UPTODATE;
+ EBUG_ON(!ck->valid);
+ EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+
+ return ret;
+}
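For reference, a hypothetical lookup that exercises this path: passing BTREE_ITER_CACHED makes the btree path code call the traverse functions above instead of descending the btree directly (the btree id chosen here is only an example).

static int example_cached_lookup(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
					       pos, BTREE_ITER_CACHED);
	int ret = bkey_err(k);

	/* use k here when !ret */
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}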
+
+static int btree_key_cache_flush_pos(struct btree_trans *trans,
+ struct bkey_cached_key key,
+ u64 journal_seq,
+ unsigned commit_flags,
+ bool evict)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree_iter c_iter, b_iter;
+ struct bkey_cached *ck = NULL;
+ int ret;
+
+ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_INTENT|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
+ ret = bch2_btree_iter_traverse(&c_iter);
+ if (ret)
+ goto out;
+
+ ck = (void *) c_iter.path->l[0].b;
+ if (!ck)
+ goto out;
+
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ if (evict)
+ goto evict;
+ goto out;
+ }
+
+ BUG_ON(!ck->valid);
+
+ if (journal_seq && ck->journal.seq != journal_seq)
+ goto out;
+
+ /*
+ * Since journal reclaim depends on us making progress here, and the
+ * allocator/copygc depend on journal reclaim making progress, we need
+ * to be using alloc reserves:
+ */
+ ret = bch2_btree_iter_traverse(&b_iter) ?:
+ bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_KEY_CACHE_RECLAIM|
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_TRIGGER_NORUN) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ (ck->journal.seq == journal_last_seq(j)
+ ? BCH_WATERMARK_reclaim
+ : 0)|
+ commit_flags);
+
+ bch2_fs_fatal_err_on(ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
+ !bch2_journal_error(j), c,
+ "error flushing key cache: %s", bch2_err_str(ret));
+ if (ret)
+ goto out;
+
+ bch2_journal_pin_drop(j, &ck->journal);
+
+ BUG_ON(!btree_node_locked(c_iter.path, 0));
+
+ if (!evict) {
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ }
+ } else {
+ struct btree_path *path2;
+evict:
+ trans_for_each_path(trans, path2)
+ if (path2 != c_iter.path)
+ __bch2_btree_path_unlock(trans, path2);
+
+ bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ }
+
+ mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
+ bkey_cached_evict(&c->btree_key_cache, ck);
+ bkey_cached_free_fast(&c->btree_key_cache, ck);
+ }
+out:
+ bch2_trans_iter_exit(trans, &b_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
+ return ret;
+}
+
+int bch2_btree_key_cache_journal_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bkey_cached *ck =
+ container_of(pin, struct bkey_cached, journal);
+ struct bkey_cached_key key;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ int ret = 0;
+
+ btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
+ key = ck->key;
+
+ if (ck->journal.seq != seq ||
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ six_unlock_read(&ck->c.lock);
+ goto unlock;
+ }
+
+ if (ck->seq != seq) {
+ bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
+ bch2_btree_key_cache_journal_flush);
+ six_unlock_read(&ck->c.lock);
+ goto unlock;
+ }
+ six_unlock_read(&ck->c.lock);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ btree_key_cache_flush_pos(trans, key, seq,
+ BTREE_INSERT_JOURNAL_RECLAIM, false));
+unlock:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+/*
+ * Flush and evict a key from the key cache:
+ */
+int bch2_btree_key_cache_flush(struct btree_trans *trans,
+ enum btree_id id, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached_key key = { id, pos };
+
+ /* Fastpath - assume it won't be found: */
+ if (!bch2_btree_key_cache_find(c, id, pos))
+ return 0;
+
+ return btree_key_cache_flush_pos(trans, key, 0, 0, true);
+}
+
+bool bch2_btree_insert_key_cached(struct btree_trans *trans,
+ unsigned flags,
+ struct btree_insert_entry *insert_entry)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+ struct bkey_i *insert = insert_entry->k;
+ bool kick_reclaim = false;
+
+ BUG_ON(insert->k.u64s > ck->u64s);
+
+ bkey_copy(ck->k, insert);
+ ck->valid = true;
+
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ set_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_inc(&c->btree_key_cache.nr_dirty);
+
+ if (bch2_nr_btree_keys_need_flush(c))
+ kick_reclaim = true;
+ }
+
+ /*
+ * To minimize lock contention, we only add the journal pin here and
+ * defer pin updates to the flush callback via ->seq. Be careful not to
+ * update ->seq on nojournal commits because we don't want to update the
+ * pin to a seq that doesn't include journal updates on disk. Otherwise
+ * we risk losing the update after a crash.
+ *
+ * The only exception is if the pin is not active in the first place. We
+ * have to add the pin because journal reclaim drives key cache
+ * flushing. The flush callback will not proceed unless ->seq matches
+ * the latest pin, so make sure it starts with a consistent value.
+ */
+ if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+ !journal_pin_active(&ck->journal)) {
+ ck->seq = trans->journal_res.seq;
+ }
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+ &ck->journal, bch2_btree_key_cache_journal_flush);
+
+ if (kick_reclaim)
+ journal_reclaim_kick(&c->journal);
+ return true;
+}
+
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ BUG_ON(!ck->valid);
+
+ /*
+ * We just did an update to the btree, bypassing the key cache: the key
+ * cache key is now stale and must be dropped, even if dirty:
+ */
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ bch2_journal_pin_drop(&c->journal, &ck->journal);
+ }
+
+ ck->valid = false;
+}
+
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bucket_table *tbl;
+ struct bkey_cached *ck, *t;
+ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+ unsigned start, flags;
+ int srcu_idx;
+
+ mutex_lock(&bc->lock);
+ srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ flags = memalloc_nofs_save();
+
+ /*
+ * Newest freed entries are at the end of the list - once we hit one
+ * that's too new to be freed, we can bail out:
+ */
+ scanned += bc->nr_freed_nonpcpu;
+
+ list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ six_lock_exit(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ atomic_long_dec(&bc->nr_freed);
+ freed++;
+ bc->nr_freed_nonpcpu--;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ scanned += bc->nr_freed_pcpu;
+
+ list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ six_lock_exit(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ atomic_long_dec(&bc->nr_freed);
+ freed++;
+ bc->nr_freed_pcpu--;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ start = bc->shrink_iter;
+
+ do {
+ struct rhash_head *pos, *next;
+
+ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+
+ while (!rht_is_a_nulls(pos)) {
+ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+ ck = container_of(pos, struct bkey_cached, hash);
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+ goto next;
+
+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ else if (bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+ }
+
+ scanned++;
+ if (scanned >= nr)
+ break;
+next:
+ pos = next;
+ }
+
+ bc->shrink_iter++;
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ } while (scanned < nr && bc->shrink_iter != start);
+
+ rcu_read_unlock();
+out:
+ memalloc_nofs_restore(flags);
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ mutex_unlock(&bc->lock);
+
+ return freed;
+}
+
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ long nr = atomic_long_read(&bc->nr_keys) -
+ atomic_long_read(&bc->nr_dirty);
+
+ return max(0L, nr);
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct bucket_table *tbl;
+ struct bkey_cached *ck, *n;
+ struct rhash_head *pos;
+ LIST_HEAD(items);
+ unsigned i;
+#ifdef __KERNEL__
+ int cpu;
+#endif
+
+ shrinker_free(bc->shrink);
+
+ mutex_lock(&bc->lock);
+
+ /*
+ * The loop is needed to guard against racing with rehash:
+ */
+ while (atomic_long_read(&bc->nr_keys)) {
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (tbl)
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ bkey_cached_evict(bc, ck);
+ list_add(&ck->list, &items);
+ }
+ rcu_read_unlock();
+ }
+
+#ifdef __KERNEL__
+ for_each_possible_cpu(cpu) {
+ struct btree_key_cache_freelist *f =
+ per_cpu_ptr(bc->pcpu_freed, cpu);
+
+ for (i = 0; i < f->nr; i++) {
+ ck = f->objs[i];
+ list_add(&ck->list, &items);
+ }
+ }
+#endif
+
+ BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+ BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
+ list_splice(&bc->freed_pcpu, &items);
+ list_splice(&bc->freed_nonpcpu, &items);
+
+ mutex_unlock(&bc->lock);
+
+ list_for_each_entry_safe(ck, n, &items, list) {
+ cond_resched();
+
+ list_del(&ck->list);
+ kfree(ck->k);
+ six_lock_exit(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ }
+
+ if (atomic_long_read(&bc->nr_dirty) &&
+ !bch2_journal_error(&c->journal) &&
+ test_bit(BCH_FS_WAS_RW, &c->flags))
+ panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
+ atomic_long_read(&bc->nr_dirty));
+
+ if (atomic_long_read(&bc->nr_keys))
+ panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
+ atomic_long_read(&bc->nr_keys));
+
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
+
+ free_percpu(bc->pcpu_freed);
+}
+
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
+{
+ mutex_init(&c->lock);
+ INIT_LIST_HEAD(&c->freed_pcpu);
+ INIT_LIST_HEAD(&c->freed_nonpcpu);
+}
+
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct shrinker *shrink;
+
+#ifdef __KERNEL__
+ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
+ if (!bc->pcpu_freed)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+#endif
+
+ if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ bc->table_init_done = true;
+
+ shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
+ if (!shrink)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ bc->shrink = shrink;
+ shrink->seeks = 0;
+ shrink->count_objects = bch2_btree_key_cache_count;
+ shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->private_data = c;
+ shrinker_register(shrink);
+ return 0;
+}
+
+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
+{
+ prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
+ prt_newline(out);
+ prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
+ prt_newline(out);
+ prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
+ prt_newline(out);
+}
+
+void bch2_btree_key_cache_exit(void)
+{
+ kmem_cache_destroy(bch2_key_cache);
+}
+
+int __init bch2_btree_key_cache_init(void)
+{
+ bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
+ if (!bch2_key_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
new file mode 100644
index 000000000000..be3acde2caa0
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
+#define _BCACHEFS_BTREE_KEY_CACHE_H
+
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 1024 + nr_keys / 2;
+
+ return max_t(ssize_t, 0, nr_dirty - max_dirty);
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 4096 + (nr_keys * 3) / 4;
+
+ return nr_dirty > max_dirty;
+}
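A worked example of these thresholds, with illustrative numbers:

/*
 * With nr_keys == 100000:
 *   bch2_nr_btree_keys_need_flush() becomes nonzero once nr_dirty
 *   exceeds 1024 + 100000 / 2       == 51024, and
 *   bch2_btree_key_cache_must_wait() triggers once nr_dirty
 *   exceeds 4096 + (100000 * 3) / 4 == 79096,
 * so background flushing starts well before updates are forced to wait.
 */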
+
+int bch2_btree_key_cache_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
+ unsigned);
+
+bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
+ struct btree_insert_entry *);
+int bch2_btree_key_cache_flush(struct btree_trans *,
+ enum btree_id, struct bpos);
+void bch2_btree_key_cache_drop(struct btree_trans *,
+ struct btree_path *);
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
+
+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
+
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
new file mode 100644
index 000000000000..290e4e57df5b
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+ struct bkey_cached *objs[16];
+ unsigned nr;
+};
+
+struct btree_key_cache {
+ struct mutex lock;
+ struct rhashtable table;
+ bool table_init_done;
+
+ struct list_head freed_pcpu;
+ size_t nr_freed_pcpu;
+ struct list_head freed_nonpcpu;
+ size_t nr_freed_nonpcpu;
+
+ struct shrinker *shrink;
+ unsigned shrink_iter;
+ struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+ atomic_long_t nr_freed;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
+};
+
+struct bkey_cached_key {
+ u32 btree_id;
+ struct bpos pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
new file mode 100644
index 000000000000..3d48834d091f
--- /dev/null
+++ b/fs/bcachefs/btree_locking.c
@@ -0,0 +1,817 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_types.h"
+
+static struct lock_class_key bch2_btree_node_lock_key;
+
+void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
+ enum six_lock_init_flags flags)
+{
+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
+ lockdep_set_novalidate_class(&b->lock);
+}
+
+#ifdef CONFIG_LOCKDEP
+void bch2_assert_btree_nodes_not_locked(void)
+{
+#if 0
+ // Re-enable when lock_class_is_held() is merged:
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+#endif
+}
+#endif
+
+/* Btree node locking: */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
+ struct btree_path *skip,
+ struct btree_bkey_cached_common *b,
+ unsigned level)
+{
+ struct btree_path *path;
+ struct six_lock_count ret;
+
+ memset(&ret, 0, sizeof(ret));
+
+ if (IS_ERR_OR_NULL(b))
+ return ret;
+
+ trans_for_each_path(trans, path)
+ if (path != skip && &path->l[level].b->c == b) {
+ int t = btree_node_locked_type(path, level);
+
+ if (t != BTREE_NODE_UNLOCKED)
+ ret.n[t]++;
+ }
+
+ return ret;
+}
+
+/* unlock */
+
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
+{
+ bch2_btree_node_unlock_write_inlined(trans, path, b);
+}
+
+/* lock */
+
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+ struct btree_trans *trans;
+ struct btree_bkey_cached_common *node_want;
+ enum six_lock_type lock_want;
+
+ /* for iterating over held locks: */
+ u8 path_idx;
+ u8 level;
+ u64 lock_start_time;
+};
+
+struct lock_graph {
+ struct trans_waiting_for_lock g[8];
+ unsigned nr;
+};
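+
+/*
+ * Deadlock detection walks the waits-for graph depth first, starting from the
+ * transaction that failed to take a lock: each lock_graph entry records one
+ * blocked transaction and the lock it is waiting on, and reaching a
+ * transaction that is already in g[] means a cycle has been found, at which
+ * point break_cycle() picks a victim to restart.
+ */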
+
+static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ prt_printf(out, "Found lock cycle (%u entries):", g->nr);
+ prt_newline(out);
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ bch2_btree_trans_to_text(out, i->trans);
+}
+
+static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i != g->g + g->nr; i++) {
+ if (i != g->g)
+ prt_str(out, "<- ");
+ prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+ }
+ prt_newline(out);
+}
+
+static void lock_graph_up(struct lock_graph *g)
+{
+ closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static noinline void lock_graph_pop_all(struct lock_graph *g)
+{
+ while (g->nr)
+ lock_graph_up(g);
+}
+
+static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+ g->g[g->nr++] = (struct trans_waiting_for_lock) {
+ .trans = trans,
+ .node_want = trans->locking,
+ .lock_want = trans->locking_wait.lock_want,
+ };
+}
+
+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+ closure_get(&trans->ref);
+ __lock_graph_down(g, trans);
+}
+
+static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g + 1; i < g->g + g->nr; i++)
+ if (i->trans->locking != i->node_want ||
+ i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+ while (g->g + g->nr > i)
+ lock_graph_up(g);
+ return true;
+ }
+
+ return false;
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+ if (i == g->g) {
+ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
+ return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ } else {
+ i->trans->lock_must_abort = true;
+ wake_up_process(i->trans->locking_wait.task);
+ return 0;
+ }
+}
+
+static int btree_trans_abort_preference(struct btree_trans *trans)
+{
+ if (trans->lock_may_not_fail)
+ return 0;
+ if (trans->locking_wait.lock_want == SIX_LOCK_write)
+ return 1;
+ if (!trans->in_traverse_all)
+ return 2;
+ return 3;
+}
+
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+{
+ struct trans_waiting_for_lock *i, *abort = NULL;
+ unsigned best = 0, pref;
+ int ret;
+
+ if (lock_graph_remove_non_waiters(g))
+ return 0;
+
+ /* Only checking, for debugfs: */
+ if (cycle) {
+ print_cycle(cycle, g);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ pref = btree_trans_abort_preference(i->trans);
+ if (pref > best) {
+ abort = i;
+ best = pref;
+ }
+ }
+
+ if (unlikely(!best)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct btree_trans *trans = i->trans;
+
+ bch2_btree_trans_to_text(&buf, trans);
+
+ prt_printf(&buf, "backtrace:");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ BUG();
+ }
+
+ ret = abort_lock(g, abort);
+out:
+ if (ret)
+ while (g->nr)
+ lock_graph_up(g);
+ return ret;
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
+ struct printbuf *cycle)
+{
+ struct btree_trans *orig_trans = g->g->trans;
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ if (i->trans == trans) {
+ closure_put(&trans->ref);
+ return break_cycle(g, cycle);
+ }
+
+ if (g->nr == ARRAY_SIZE(g->g)) {
+ closure_put(&trans->ref);
+
+ if (orig_trans->lock_may_not_fail)
+ return 0;
+
+ while (g->nr)
+ lock_graph_up(g);
+
+ if (cycle)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+ }
+
+ __lock_graph_down(g, trans);
+ return 0;
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
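+ /* read = 0, intent = 1, write = 2: only read/read and read/intent do not conflict */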
+ return t1 + t2 > 1;
+}
+
+int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+{
+ struct lock_graph g;
+ struct trans_waiting_for_lock *top;
+ struct btree_bkey_cached_common *b;
+ struct btree_path *path;
+ unsigned path_idx;
+ int ret;
+
+ if (trans->lock_must_abort) {
+ if (cycle)
+ return -1;
+
+ trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+ }
+
+ g.nr = 0;
+ lock_graph_down(&g, trans);
+next:
+ if (!g.nr)
+ return 0;
+
+ top = &g.g[g.nr - 1];
+
+ trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) {
+ if (!path->nodes_locked)
+ continue;
+
+ if (path_idx != top->path_idx) {
+ top->path_idx = path_idx;
+ top->level = 0;
+ top->lock_start_time = 0;
+ }
+
+ for (;
+ top->level < BTREE_MAX_DEPTH;
+ top->level++, top->lock_start_time = 0) {
+ int lock_held = btree_node_locked_type(path, top->level);
+
+ if (lock_held == BTREE_NODE_UNLOCKED)
+ continue;
+
+ b = &READ_ONCE(path->l[top->level].b)->c;
+
+ if (IS_ERR_OR_NULL(b)) {
+ /*
+ * If we get here, it means we raced with the
+ * other thread updating its btree_path
+ * structures - which means it can't be blocked
+ * waiting on a lock:
+ */
+ if (!lock_graph_remove_non_waiters(&g)) {
+ /*
+ * If lock_graph_remove_non_waiters()
+ * didn't do anything, it must be
+ * because we're being called by debugfs
+ * checking for lock cycles, which
+ * invokes us on btree_transactions that
+ * aren't actually waiting on anything.
+ * Just bail out:
+ */
+ lock_graph_pop_all(&g);
+ }
+
+ goto next;
+ }
+
+ if (list_empty_careful(&b->lock.wait_list))
+ continue;
+
+ raw_spin_lock(&b->lock.wait_lock);
+ list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+ BUG_ON(b != trans->locking);
+
+ if (top->lock_start_time &&
+ time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+ continue;
+
+ top->lock_start_time = trans->locking_wait.start_time;
+
+ /* Don't check for self deadlock: */
+ if (trans == top->trans ||
+ !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+ continue;
+
+ closure_get(&trans->ref);
+ raw_spin_unlock(&b->lock.wait_lock);
+
+ ret = lock_graph_descend(&g, trans, cycle);
+ if (ret)
+ return ret;
+ goto next;
+ }
+ raw_spin_unlock(&b->lock.wait_lock);
+ }
+ }
+
+ if (g.nr > 1 && cycle)
+ print_chain(cycle, &g);
+ lock_graph_up(&g);
+ goto next;
+}
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+ struct btree_trans *trans = p;
+
+ return bch2_check_for_deadlock(trans, NULL);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+ int ret;
+
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ six_lock_readers_add(&b->lock, -readers);
+ ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
+ lock_may_not_fail, _RET_IP_);
+ six_lock_readers_add(&b->lock, readers);
+
+ if (ret)
+ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
+
+ return ret;
+}
+
+void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ struct btree_path *linked;
+ unsigned i;
+ int ret;
+
+ /*
+ * XXX BIG FAT NOTICE
+ *
+ * Drop all read locks before taking a write lock:
+ *
+ * This is a hack, because bch2_btree_node_lock_write_nofail() is a
+ * hack - but by dropping read locks first, this should never fail, and
+ * we only use this in code paths where whatever read locks we've
+ * already taken are no longer needed:
+ */
+
+ trans_for_each_path(trans, linked) {
+ if (!linked->nodes_locked)
+ continue;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ if (btree_node_read_locked(linked, i)) {
+ btree_node_unlock(trans, linked, i);
+ btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
+ }
+ }
+
+ ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
+}
+
+/* relock */
+
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade,
+ struct get_locks_fail *f)
+{
+ unsigned l = path->level;
+ int fail_idx = -1;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!(upgrade
+ ? bch2_btree_node_upgrade(trans, path, l)
+ : bch2_btree_node_relock(trans, path, l))) {
+ fail_idx = l;
+
+ if (f) {
+ f->l = l;
+ f->b = path->l[l].b;
+ }
+ }
+
+ l++;
+ } while (l < path->locks_want);
+
+ /*
+	 * When we fail to get a lock, we have to ensure that any child nodes
+	 * can't be relocked, so bch2_btree_path_traverse() has to walk back up
+	 * to the node that we failed to relock:
+ */
+ if (fail_idx >= 0) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+ do {
+ path->l[fail_idx].b = upgrade
+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ --fail_idx;
+ } while (fail_idx >= 0);
+ }
+
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ bch2_trans_verify_locks(trans);
+
+ return path->uptodate < BTREE_ITER_NEED_RELOCK;
+}
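+
+/*
+ * For illustration: with path->level == 0 and locks_want == 3, if relocking
+ * succeeds at levels 0 and 2 but fails at level 1, fail_idx ends up as 1, the
+ * whole path is unlocked, and l[0].b and l[1].b are set to error pointers -
+ * so the next traversal re-descends from level 2 instead of trusting the
+ * now-unverified child pointers below it.
+ */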
+
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level,
+ bool trace)
+{
+ struct btree *b = btree_path_node(path, level);
+ int want = __btree_lock_want(path, level);
+
+ if (race_fault())
+ goto fail;
+
+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+ (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, want))) {
+ mark_btree_node_locked(trans, path, level, want);
+ return true;
+ }
+fail:
+ if (trace && !trans->notrace_relock_fail)
+ trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
+ return false;
+}
+
+/* upgrade */
+
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree *b = path->l[level].b;
+ struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
+
+ if (!is_btree_node(path, level))
+ return false;
+
+ switch (btree_lock_want(path, level)) {
+ case BTREE_NODE_UNLOCKED:
+ BUG_ON(btree_node_locked(path, level));
+ return true;
+ case BTREE_NODE_READ_LOCKED:
+ BUG_ON(btree_node_intent_locked(path, level));
+ return bch2_btree_node_relock(trans, path, level);
+ case BTREE_NODE_INTENT_LOCKED:
+ break;
+ case BTREE_NODE_WRITE_LOCKED:
+ BUG();
+ }
+
+ if (btree_node_intent_locked(path, level))
+ return true;
+
+ if (race_fault())
+ return false;
+
+ if (btree_node_locked(path, level)) {
+ bool ret;
+
+ six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
+ ret = six_lock_tryupgrade(&b->c.lock);
+ six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
+
+ if (ret)
+ goto success;
+ } else {
+ if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+ goto success;
+ }
+
+ /*
+ * Do we already have an intent lock via another path? If so, just bump
+ * lock count:
+ */
+ if (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(trans, path, level);
+ goto success;
+ }
+
+ trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
+ return false;
+success:
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+ return true;
+}
+
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ */
+int bch2_btree_path_relock_intent(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned l;
+
+ for (l = path->level;
+ l < path->locks_want && btree_path_node(path, l);
+ l++) {
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+ }
+ }
+
+ return 0;
+}
+
+__flatten
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ struct get_locks_fail f;
+
+ return btree_path_get_locks(trans, path, false, &f);
+}
+
+int __bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+ }
+
+ return 0;
+}
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
+{
+ EBUG_ON(path->locks_want >= new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ return btree_path_get_locks(trans, path, true, f);
+}
+
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
+{
+ struct btree_path *linked;
+
+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
+ return true;
+
+ /*
+ * XXX: this is ugly - we'd prefer to not be mucking with other
+ * iterators in the btree_trans here.
+ *
+ * On failure to upgrade the iterator, setting iter->locks_want and
+ * calling get_locks() is sufficient to make bch2_btree_path_traverse()
+ * get the locks we want on transaction restart.
+ *
+ * But if this iterator was a clone, on transaction restart what we did
+ * to this iterator isn't going to be preserved.
+ *
+ * Possibly we could add an iterator field for the parent iterator when
+ * an iterator is a copy - for now, we'll just upgrade any other
+ * iterators with the same btree id.
+ *
+ * The code below used to be needed to ensure ancestor nodes get locked
+ * before interior nodes - now that's handled by
+ * bch2_btree_path_traverse_all().
+ */
+ if (!path->cached && !trans->in_traverse_all)
+ trans_for_each_path(trans, linked)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+ btree_path_get_locks(trans, linked, true, NULL);
+ }
+
+ return false;
+}
+
+void __bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ unsigned l;
+
+ if (trans->restarted)
+ return;
+
+ EBUG_ON(path->locks_want < new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ while (path->nodes_locked &&
+ (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
+ if (l > path->level) {
+ btree_node_unlock(trans, path, l);
+ } else {
+ if (btree_node_intent_locked(path, l)) {
+ six_lock_downgrade(&path->l[l].b->c.lock);
+ mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
+ }
+ break;
+ }
+ }
+
+ bch2_btree_path_verify_locks(path);
+
+ path->downgrade_seq++;
+ trace_path_downgrade(trans, _RET_IP_, path);
+}
+
+/* Btree transaction locking: */
+
+void bch2_trans_downgrade(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (trans->restarted)
+ return;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_downgrade(trans, path);
+}
+
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+ return 0;
+}
+
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+ return 0;
+}
+
+void bch2_trans_unlock_noassert(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(trans, path);
+}
+
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(trans, path);
+}
+
+void bch2_trans_unlock_long(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ bch2_trans_srcu_unlock(trans);
+}
+
+bool bch2_trans_locked(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->nodes_locked)
+ return true;
+ return false;
+}
+
+int __bch2_trans_mutex_lock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
+
+ if (ret)
+ mutex_unlock(lock);
+ return ret;
+}
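+
+/*
+ * Note on the error path above: drop_locks_do() unlocks the transaction, runs
+ * the expression (mutex_lock() here, contributing 0), then relocks.  A nonzero
+ * return therefore means the relock failed with a transaction restart, and
+ * since the caller won't proceed we must not return with the mutex held.
+ */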
+
+/* Debug */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void bch2_btree_path_verify_locks(struct btree_path *path)
+{
+ unsigned l;
+
+ if (!path->nodes_locked) {
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level));
+ return;
+ }
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ int want = btree_lock_want(path, l);
+ int have = btree_node_locked_type(path, l);
+
+ BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+ BUG_ON(is_btree_node(path, l) &&
+ (want == BTREE_NODE_UNLOCKED ||
+ have != BTREE_NODE_WRITE_LOCKED) &&
+ want != have);
+ }
+}
+
+void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify_locks(path);
+}
+
+#endif
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
new file mode 100644
index 000000000000..11b0a2c8cd69
--- /dev/null
+++ b/fs/bcachefs/btree_locking.h
@@ -0,0 +1,433 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_LOCKING_H
+#define _BCACHEFS_BTREE_LOCKING_H
+
+/*
+ * Only for internal btree use:
+ *
+ * The btree iterator tracks what locks it wants to take, and what locks it
+ * currently has - here we have wrappers for locking/unlocking btree nodes and
+ * updating the iterator state
+ */
+
+#include "btree_iter.h"
+#include "six.h"
+
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
+
+#ifdef CONFIG_LOCKDEP
+void bch2_assert_btree_nodes_not_locked(void);
+#else
+static inline void bch2_assert_btree_nodes_not_locked(void) {}
+#endif
+
+void bch2_trans_unlock_noassert(struct btree_trans *);
+
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
+{
+ return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
+}
+
+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
+{
+ return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
+ ? &trans->c->btree_transaction_stats[trans->fn_idx]
+ : NULL;
+}
+
+/* matches six lock types */
+enum btree_node_locked_type {
+ BTREE_NODE_UNLOCKED = -1,
+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
+ BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
+};
+
+static inline int btree_node_locked_type(struct btree_path *path,
+ unsigned level)
+{
+ return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
+}
+
+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
+{
+ return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
+}
+
+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
+{
+ return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
+}
+
+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
+{
+ return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
+}
+
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
+{
+ return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
+}
+
+static inline void mark_btree_node_locked_noreset(struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
+{
+ /* relying on this to avoid a branch */
+ BUILD_BUG_ON(SIX_LOCK_read != 0);
+ BUILD_BUG_ON(SIX_LOCK_intent != 1);
+
+ path->nodes_locked &= ~(3U << (level << 1));
+ path->nodes_locked |= (type + 1) << (level << 1);
+}
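+
+/*
+ * Worked example of the nodes_locked encoding used above - two bits per
+ * level, holding (lock type + 1): 0 = unlocked, 1 = read, 2 = intent,
+ * 3 = write.  A path with a read lock at level 0 and an intent lock at
+ * level 1 has
+ *
+ *	nodes_locked = ((SIX_LOCK_read + 1) << 0)|((SIX_LOCK_intent + 1) << 2)
+ *		     = 0b1001
+ *
+ * and btree_node_locked_type() recovers the lock type for each level by
+ * shifting and masking.
+ */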
+
+static inline void mark_btree_node_unlocked(struct btree_path *path,
+ unsigned level)
+{
+ EBUG_ON(btree_node_write_locked(path, level));
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
+}
+
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
+{
+ mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[level].lock_taken_time = local_clock();
+#endif
+}
+
+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
+{
+ return level < path->locks_want
+ ? SIX_LOCK_intent
+ : SIX_LOCK_read;
+}
+
+static inline enum btree_node_locked_type
+btree_lock_want(struct btree_path *path, int level)
+{
+ if (level < path->level)
+ return BTREE_NODE_UNLOCKED;
+ if (level < path->locks_want)
+ return BTREE_NODE_INTENT_LOCKED;
+ if (level == path->level)
+ return BTREE_NODE_READ_LOCKED;
+ return BTREE_NODE_UNLOCKED;
+}
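+
+/*
+ * For illustration: with path->level == 0 and locks_want == 2,
+ * btree_lock_want() returns INTENT for levels 0 and 1 and UNLOCKED above
+ * that; with locks_want == 0 it returns READ for level 0 (the level the path
+ * iterates at) and UNLOCKED everywhere else.
+ */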
+
+static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ if (s)
+ __bch2_time_stats_update(&s->lock_hold_times,
+ path->l[level].lock_taken_time,
+ local_clock());
+#endif
+}
+
+/* unlock: */
+
+static inline void btree_node_unlock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ int lock_type = btree_node_locked_type(path, level);
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ if (lock_type != BTREE_NODE_UNLOCKED) {
+ six_unlock_type(&path->l[level].b->c.lock, lock_type);
+ btree_trans_lock_hold_time_update(trans, path, level);
+ }
+ mark_btree_node_unlocked(path, level);
+}
+
+static inline int btree_path_lowest_level_locked(struct btree_path *path)
+{
+ return __ffs(path->nodes_locked) >> 1;
+}
+
+static inline int btree_path_highest_level_locked(struct btree_path *path)
+{
+ return __fls(path->nodes_locked) >> 1;
+}
+
+static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
+
+ while (path->nodes_locked)
+ btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
+}
+
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+ struct btree *b)
+{
+ struct btree_path *linked;
+
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
+ EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
+
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+
+ trans_for_each_path_with_node(trans, b, linked)
+ linked->l[b->c.level].lock_seq++;
+
+ six_unlock_write(&b->c.lock);
+}
+
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
+/* lock: */
+
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ bool lock_may_not_fail,
+ unsigned long ip)
+{
+ int ret;
+
+ trans->lock_may_not_fail = lock_may_not_fail;
+ trans->lock_must_abort = false;
+ trans->locking = b;
+
+ ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
+ WRITE_ONCE(trans->locking, NULL);
+ WRITE_ONCE(trans->locking_wait.start_time, 0);
+ return ret;
+}
+
+static inline int __must_check
+btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ unsigned long ip)
+{
+ return __btree_node_lock_nopath(trans, b, type, false, ip);
+}
+
+static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type)
+{
+ int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
+
+ BUG_ON(ret);
+}
+
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ */
+static inline bool btree_node_lock_increment(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
+ enum btree_node_locked_type want)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (&path->l[level].b->c == b &&
+ btree_node_locked_type(path, level) >= want) {
+ six_lock_increment(&b->lock, (enum six_lock_type) want);
+ return true;
+ }
+
+ return false;
+}
+
+static inline int btree_node_lock(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
+ enum six_lock_type type,
+ unsigned long ip)
+{
+ int ret = 0;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
+
+ if (likely(six_trylock_type(&b->lock, type)) ||
+ btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
+ !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[b->level].lock_taken_time = local_clock();
+#endif
+ }
+
+ return ret;
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
+ struct btree_bkey_cached_common *b, bool);
+
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ EBUG_ON(&path->l[b->level].b->c != b);
+ EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
+ EBUG_ON(!btree_node_intent_locked(path, b->level));
+
+ /*
+ * six locks are unfair, and read locks block while a thread wants a
+ * write lock: thus, we need to tell the cycle detector we have a write
+ * lock _before_ taking the lock:
+ */
+ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
+
+ return likely(six_trylock_write(&b->lock))
+ ? 0
+ : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
+}
+
+static inline int __must_check
+bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ return __btree_node_lock_write(trans, path, b, false);
+}
+
+void bch2_btree_node_lock_write_nofail(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *);
+
+/* relock: */
+
+bool bch2_btree_path_relock_norestart(struct btree_trans *,
+ struct btree_path *, unsigned long);
+int __bch2_btree_path_relock(struct btree_trans *,
+ struct btree_path *, unsigned long);
+
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_node_locked(path, path->level)
+ ? 0
+ : __bch2_btree_path_relock(trans, path, trace_ip);
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
+
+static inline bool bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ EBUG_ON(btree_node_locked(path, level) &&
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, true));
+}
+
+static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ EBUG_ON(btree_node_locked(path, level) &&
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, false));
+}
+
+/* upgrade */
+
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
+
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
+
+static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ struct get_locks_fail f;
+ unsigned old_locks_want = path->locks_want;
+
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+ if (path->locks_want < new_locks_want
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
+ : path->uptodate == BTREE_ITER_UPTODATE)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
+ old_locks_want, new_locks_want, &f);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
+}
+
+/* misc: */
+
+static inline void btree_path_set_should_be_locked(struct btree_path *path)
+{
+ EBUG_ON(!btree_node_locked(path, path->level));
+ EBUG_ON(path->uptodate);
+
+ path->should_be_locked = true;
+}
+
+static inline void __btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned l)
+{
+ btree_node_unlock(trans, path, l);
+ path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+}
+
+static inline void btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ __btree_path_set_level_up(trans, path, path->level++);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+}
+
+/* debug */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *b,
+ unsigned);
+
+int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_path_verify_locks(struct btree_path *);
+void bch2_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+#endif
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
new file mode 100644
index 000000000000..12907beda98c
--- /dev/null
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -0,0 +1,1162 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "snapshot.h"
+
+#include <linux/prefetch.h>
+
+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
+
+ if (j_k)
+ k = bkey_i_to_s_c(j_k);
+ }
+
+ u = *k.k;
+ u.needs_whiteout = i->old_k.needs_whiteout;
+
+ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
+ BUG_ON(i->old_v != k.v);
+#endif
+}
+
+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+{
+ return i->path->l + i->level;
+}
+
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ return i != trans->updates &&
+ insert_l(&i[0])->b == insert_l(&i[-1])->b;
+}
+
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ return i + 1 < trans->updates + trans->nr_updates &&
+ insert_l(&i[0])->b == insert_l(&i[1])->b;
+}
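+
+/*
+ * trans->updates is kept sorted, so updates that land in the same leaf node
+ * are adjacent; same_leaf_as_prev()/same_leaf_as_next() are what let the
+ * commit path take and release a single write lock per leaf by skipping
+ * entries whose neighbour already hit the same node.
+ */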
+
+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+
+ if (unlikely(btree_node_just_written(b)) &&
+ bch2_btree_post_write_cleanup(c, b))
+ bch2_trans_node_reinit_iter(trans, b);
+
+ /*
+ * If the last bset has been written, or if it's gotten too big - start
+ * a new bset to insert into:
+ */
+ if (want_new_bset(c, b))
+ bch2_btree_init_next(trans, b);
+}
+
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ EBUG_ON(trans->write_locked);
+
+ trans_for_each_update(trans, i) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ return trans_lock_write_fail(trans, i);
+
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trans->write_locked = true;
+ return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+ if (likely(trans->write_locked)) {
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ bch2_btree_node_unlock_write_inlined(trans, i->path,
+ insert_l(i)->b);
+ trans->write_locked = false;
+ }
+}
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/* Handle overwrites and do insert, for non extents: */
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
+{
+ struct bkey_packed *k;
+ unsigned clobber_u64s = 0, new_u64s = 0;
+
+ EBUG_ON(btree_node_just_written(b));
+ EBUG_ON(bset_written(b, btree_bset_last(b)));
+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
+ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
+ EBUG_ON(insert->k.u64s >
+ bch_btree_keys_u64s_remaining(trans->c, b));
+ EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
+ k = NULL;
+
+ /* @k is the key being overwritten/deleted, if any: */
+ EBUG_ON(k && bkey_deleted(k));
+
+ /* Deleting, but not found? nothing to do: */
+ if (bkey_deleted(&insert->k) && !k)
+ return false;
+
+ if (bkey_deleted(&insert->k)) {
+ /* Deleting: */
+ btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
+
+ if (k->needs_whiteout)
+ push_whiteout(trans->c, b, insert->k.p);
+ k->needs_whiteout = false;
+
+ if (k >= btree_bset_last(b)->start) {
+ clobber_u64s = k->u64s;
+ bch2_bset_delete(b, k, clobber_u64s);
+ goto fix_iter;
+ } else {
+ bch2_btree_path_fix_key_modified(trans, b, k);
+ }
+
+ return true;
+ }
+
+ if (k) {
+ /* Overwriting: */
+ btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
+
+ insert->k.needs_whiteout = k->needs_whiteout;
+ k->needs_whiteout = false;
+
+ if (k >= btree_bset_last(b)->start) {
+ clobber_u64s = k->u64s;
+ goto overwrite;
+ } else {
+ bch2_btree_path_fix_key_modified(trans, b, k);
+ }
+ }
+
+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
+overwrite:
+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+ new_u64s = k->u64s;
+fix_iter:
+ if (clobber_u64s != new_u64s)
+ bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
+ clobber_u64s, new_u64s);
+ return true;
+}
+
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+ unsigned i, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_write *w = container_of(pin, struct btree_write, journal);
+ struct btree *b = container_of(w, struct btree, writes[i]);
+ struct btree_trans *trans = bch2_trans_get(c);
+ unsigned long old, new, v;
+ unsigned idx = w - b->writes;
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ v = READ_ONCE(b->flags);
+
+ do {
+ old = new = v;
+
+ if (!(old & (1 << BTREE_NODE_dirty)) ||
+ !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+ w->journal.seq != seq)
+ break;
+
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ new |= BTREE_WRITE_journal_reclaim;
+ new |= 1 << BTREE_NODE_need_write;
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->c.lock);
+
+ bch2_trans_put(trans);
+ return 0;
+}
+
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+ return __btree_node_flush(j, pin, 0, seq);
+}
+
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+ return __btree_node_flush(j, pin, 1, seq);
+}
+
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+ struct btree *b, u64 seq)
+{
+ struct btree_write *w = btree_current_write(b);
+
+ bch2_journal_pin_add(&c->journal, seq, &w->journal,
+ btree_node_write_idx(b) == 0
+ ? bch2_btree_node_flush0
+ : bch2_btree_node_flush1);
+}
+
+/**
+ * bch2_btree_insert_key_leaf() - insert a key into a leaf node
+ * @trans: btree transaction object
+ * @path: path pointing to @insert's pos
+ * @insert: key to insert
+ * @journal_seq: sequence number of journal reservation
+ */
+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bkey_i *insert,
+ u64 journal_seq)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(path)->b;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bset *i = bset(b, t);
+ int old_u64s = bset_u64s(t);
+ int old_live_u64s = b->nr.live_u64s;
+ int live_u64s_added, u64s_added;
+
+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
+ &path_l(path)->iter, insert)))
+ return;
+
+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, journal_seq);
+
+ if (unlikely(!btree_node_dirty(b))) {
+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ set_btree_node_dirty_acct(c, b);
+ }
+
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+ u64s_added = (int) bset_u64s(t) - old_u64s;
+
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+ if (u64s_added > live_u64s_added &&
+ bch2_maybe_compact_whiteouts(c, b))
+ bch2_trans_node_reinit_iter(trans, b);
+}
+
+/* Cached btree updates: */
+
+/* Normal update interface: */
+
+static inline void btree_insert_entry_checks(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
+ BUG_ON(i->cached != i->path->cached);
+ BUG_ON(i->level != i->path->level);
+ BUG_ON(i->btree_id != i->path->btree_id);
+ EBUG_ON(!i->level &&
+ btree_type_has_snapshots(i->btree_id) &&
+ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+ i->k->k.p.snapshot &&
+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+}
+
+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+ unsigned flags)
+{
+ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
+ trans->journal_u64s, flags);
+}
+
+#define JSET_ENTRY_LOG_U64s 4
+
+static noinline void journal_transaction_name(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct jset_entry *entry =
+ bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_log, 0, 0,
+ JSET_ENTRY_LOG_U64s);
+ struct jset_entry_log *l =
+ container_of(entry, struct jset_entry_log, entry);
+
+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
+}
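+
+/*
+ * The log entry above reserves JSET_ENTRY_LOG_U64s (4) u64s, i.e. 32 bytes,
+ * for the transaction's function name; longer names are simply truncated by
+ * strncpy().
+ */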
+
+static inline int btree_key_can_insert(struct btree_trans *trans,
+ struct btree *b, unsigned u64s)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_btree_node_insert_fits(c, b, u64s))
+ return -BCH_ERR_btree_insert_btree_node_full;
+
+ return 0;
+}
+
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned new_u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ struct bkey_i *new_k;
+ int ret;
+
+ bch2_trans_unlock_write(trans);
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(path->btree_id), new_u64s);
+ return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ }
+
+ ret = bch2_trans_relock(trans) ?:
+ bch2_trans_lock_write(trans);
+ if (unlikely(ret)) {
+ kfree(new_k);
+ return ret;
+ }
+
+ memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ kfree(ck->k);
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
+
+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ struct btree_insert_entry *i;
+ unsigned new_u64s;
+ struct bkey_i *new_k;
+
+ EBUG_ON(path->level);
+
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c) &&
+ !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+ return -BCH_ERR_btree_insert_need_journal_reclaim;
+
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at most 7
+ * bytes (it won't be used):
+ */
+ u64s += 1;
+
+ if (u64s <= ck->u64s)
+ return 0;
+
+ new_u64s = roundup_pow_of_two(u64s);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ if (unlikely(!new_k))
+ return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
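+
+/*
+ * Sizing example for the reallocation above (illustrative): if the cached key
+ * currently has ck->u64s == 8 and the update needs u64s == 10, the +1 pad for
+ * bch2_varint_decode makes that 11, which rounds up to a 16 u64 (128 byte)
+ * allocation.  Growing in powers of two bounds the number of reallocations as
+ * a key grows.
+ */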
+
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+ struct btree_insert_entry *i,
+ unsigned flags)
+{
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+ struct bkey_i *new = i->k;
+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+ int ret;
+
+ verify_update_old_key(trans, i);
+
+ if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ return 0;
+
+ if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
+ return 0;
+
+ if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
+ ret = bch2_mark_key(trans, i->btree_id, i->level,
+ old, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ } else {
+ struct bkey _deleted = KEY(0, 0, 0);
+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
+
+ _deleted.p = i->path->pos;
+
+ ret = bch2_mark_key(trans, i->btree_id, i->level,
+ deleted, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|flags) ?:
+ bch2_mark_key(trans, i->btree_id, i->level,
+ old, deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+ }
+
+ return ret;
+}
+
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+ bool overwrite)
+{
+ /*
+ * Transactional triggers create new btree_insert_entries, so we can't
+	 * pass them a pointer to a btree_insert_entry - that memory is going
+	 * to move:
+ */
+ struct bkey old_k = i->old_k;
+ struct bkey_s_c old = { &old_k, i->old_v };
+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+
+ verify_update_old_key(trans, i);
+
+ if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ return 0;
+
+ if (!i->insert_trigger_run &&
+ !i->overwrite_trigger_run &&
+ old_ops->trans_trigger == new_ops->trans_trigger) {
+ i->overwrite_trigger_run = true;
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_OVERWRITE|
+ i->flags) ?: 1;
+ } else if (overwrite && !i->overwrite_trigger_run) {
+ i->overwrite_trigger_run = true;
+ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+ } else if (!overwrite && !i->insert_trigger_run) {
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+ } else {
+ return 0;
+ }
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+ struct btree_insert_entry *btree_id_start)
+{
+ struct btree_insert_entry *i;
+ bool trans_trigger_run;
+ int ret, overwrite;
+
+ for (overwrite = 1; overwrite >= 0; --overwrite) {
+
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = btree_id_start;
+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ i++) {
+ if (i->btree_id != btree_id)
+ continue;
+
+ ret = run_one_trans_trigger(trans, i, overwrite);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
+ }
+
+ return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ unsigned btree_id = 0;
+ int ret = 0;
+
+ /*
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ if (btree_id == BTREE_ID_alloc)
+ continue;
+
+ while (btree_id_start < trans->updates + trans->nr_updates &&
+ btree_id_start->btree_id < btree_id)
+ btree_id_start++;
+
+ ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ if (ret)
+ return ret;
+ }
+
+ trans_for_each_update(trans, i) {
+ if (i->btree_id > BTREE_ID_alloc)
+ break;
+ if (i->btree_id == BTREE_ID_alloc) {
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ if (ret)
+ return ret;
+ break;
+ }
+ }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
+#endif
+ return 0;
+}
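+
+/*
+ * Worked through on an example: a commit that updates an extent and an alloc
+ * key runs the extent's transactional triggers first (which may themselves
+ * queue further alloc updates), and only then the alloc triggers - which is
+ * why BTREE_ID_alloc is skipped in the first pass above and handled last.
+ */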
+
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ int ret = 0;
+
+ trans_for_each_update(trans, i) {
+ /*
+ * XXX: synchronization of cached update triggers with gc
+ * XXX: synchronization of interior node updates with gc
+ */
+ BUG_ON(i->cached || i->level);
+
+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static inline int
+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry **stopped_at,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
+ struct btree_trans_commit_hook *h;
+ unsigned u64s = 0;
+ int ret;
+
+ if (race_fault()) {
+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
+ }
+
+ /*
+	 * Check that the insert will fit in the leaf node with the write lock
+	 * held; otherwise another thread could write out the node, changing
+	 * the amount of space available:
+ */
+
+ prefetch(&trans->c->journal.flags);
+
+ trans_for_each_update(trans, i) {
+ /* Multiple inserts might go to same leaf: */
+ if (!same_leaf_as_prev(trans, i))
+ u64s = 0;
+
+ u64s += i->k->k.u64s;
+ ret = !i->cached
+ ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
+ : btree_key_can_insert_cached(trans, flags, i->path, u64s);
+ if (ret) {
+ *stopped_at = i;
+ return ret;
+ }
+ }
+
+ if (trans->nr_wb_updates &&
+ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
+ return -BCH_ERR_btree_insert_need_flush_buffer;
+
+ /*
+	 * Don't get a journal reservation until after we know the insert will
+	 * succeed:
+ */
+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ ret = bch2_trans_journal_res_get(trans,
+ (flags & BCH_WATERMARK_MASK)|
+ JOURNAL_RES_GET_NONBLOCK);
+ if (ret)
+ return ret;
+
+ if (unlikely(trans->journal_transaction_names))
+ journal_transaction_name(trans);
+ } else {
+ trans->journal_res.seq = c->journal.replay_journal_seq;
+ }
+
+ /*
+ * Not allowed to fail after we've gotten our journal reservation - we
+ * have to use it:
+ */
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+ if (bch2_journal_seq_verify)
+ trans_for_each_update(trans, i)
+ i->k->k.version.lo = trans->journal_res.seq;
+ else if (bch2_inject_invalid_keys)
+ trans_for_each_update(trans, i)
+ i->k->k.version = MAX_VERSION;
+ }
+
+ if (trans->fs_usage_deltas &&
+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
+ return -BCH_ERR_btree_insert_need_mark_replicas;
+
+ if (trans->nr_wb_updates) {
+ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
+
+ ret = bch2_btree_insert_keys_write_buffer(trans);
+ if (ret)
+ goto revert_fs_usage;
+ }
+
+ h = trans->hooks;
+ while (h) {
+ ret = h->fn(trans, h);
+ if (ret)
+ goto revert_fs_usage;
+ h = h->next;
+ }
+
+ trans_for_each_update(trans, i)
+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, i->flags);
+ if (ret)
+ goto fatal_err;
+ }
+
+ if (unlikely(c->gc_pos.phase)) {
+ ret = bch2_trans_commit_run_gc_triggers(trans);
+ if (ret)
+ goto fatal_err;
+ }
+
+ if (unlikely(trans->extra_journal_entries.nr)) {
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+ trans->extra_journal_entries.data,
+ trans->extra_journal_entries.nr);
+
+ trans->journal_res.offset += trans->extra_journal_entries.nr;
+ trans->journal_res.u64s -= trans->extra_journal_entries.nr;
+ }
+
+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ struct journal *j = &c->journal;
+ struct jset_entry *entry;
+
+ trans_for_each_update(trans, i) {
+ if (i->key_cache_already_flushed)
+ continue;
+
+ if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ continue;
+
+ verify_update_old_key(trans, i);
+
+ if (trans->journal_transaction_names) {
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_overwrite,
+ i->btree_id, i->level,
+ i->old_k.u64s);
+ bkey_reassemble((struct bkey_i *) entry->start,
+ (struct bkey_s_c) { &i->old_k, i->old_v });
+ }
+
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ i->btree_id, i->level,
+ i->k->k.u64s);
+ bkey_copy((struct bkey_i *) entry->start, i->k);
+ }
+
+ trans_for_each_wb_update(trans, wb) {
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ wb->btree, 0,
+ wb->k.k.u64s);
+ bkey_copy((struct bkey_i *) entry->start, &wb->k);
+ }
+
+ if (trans->journal_seq)
+ *trans->journal_seq = trans->journal_res.seq;
+ }
+
+ trans_for_each_update(trans, i) {
+ i->k->k.needs_whiteout = false;
+
+ if (!i->cached) {
+ u64 seq = trans->journal_res.seq;
+
+ if (i->flags & BTREE_UPDATE_PREJOURNAL)
+ seq = i->seq;
+
+ bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+ } else if (!i->key_cache_already_flushed)
+ bch2_btree_insert_key_cached(trans, flags, i);
+ else {
+ bch2_btree_key_cache_drop(trans, i->path);
+ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+ }
+ }
+
+ return 0;
+fatal_err:
+ bch2_fatal_error(c);
+revert_fs_usage:
+ if (trans->fs_usage_deltas)
+ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
+ return ret;
+}
+
+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
+
+ trans_for_each_update(trans, i)
+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+
+ trans_for_each_wb_update(trans, wb)
+ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
+}
+
+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
+ enum bkey_invalid_flags flags,
+ struct btree_insert_entry *i,
+ struct printbuf *err)
+{
+ struct bch_fs *c = trans->c;
+
+ printbuf_reset(err);
+ prt_printf(err, "invalid bkey on insert from %s -> %ps",
+ trans->fn, (void *) i->ip_allocated);
+ prt_newline(err);
+ printbuf_indent_add(err, 2);
+
+ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
+ prt_newline(err);
+
+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
+ bch2_print_string_as_lines(KERN_ERR, err->buf);
+
+ bch2_inconsistent_error(c);
+ bch2_dump_trans_updates(trans);
+
+ return -EINVAL;
+}
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry **stopped_at,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ int ret = 0, u64s_delta = 0;
+
+ trans_for_each_update(trans, i) {
+ if (i->cached)
+ continue;
+
+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+ u64s_delta -= i->old_btree_u64s;
+
+ if (!same_leaf_as_next(trans, i)) {
+ if (u64s_delta <= 0) {
+ ret = bch2_foreground_maybe_merge(trans, i->path,
+ i->level, flags);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ u64s_delta = 0;
+ }
+ }
+
+ ret = bch2_trans_lock_write(trans);
+ if (unlikely(ret))
+ return ret;
+
+ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
+
+ if (!ret && unlikely(trans->journal_replay_not_finished))
+ bch2_drop_overwrites_from_journal(trans);
+
+ bch2_trans_unlock_write(trans);
+
+ if (!ret && trans->journal_pin)
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+ trans->journal_pin, NULL);
+
+ /*
+ * Drop journal reservation after dropping write locks, since dropping
+ * the journal reservation may kick off a journal write:
+ */
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
+
+ return ret;
+}
+
+static int journal_reclaim_wait_done(struct bch_fs *c)
+{
+ int ret = bch2_journal_error(&c->journal) ?:
+ !bch2_btree_key_cache_must_wait(c);
+
+ if (!ret)
+ journal_reclaim_kick(&c->journal);
+ return ret;
+}
+
+static noinline
+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry *i,
+ int ret, unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+
+ switch (ret) {
+ case -BCH_ERR_btree_insert_btree_node_full:
+ ret = bch2_btree_split_leaf(trans, i->path, flags);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
+ break;
+ case -BCH_ERR_btree_insert_need_mark_replicas:
+ ret = drop_locks_do(trans,
+ bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
+ break;
+ case -BCH_ERR_journal_res_get_blocked:
+ /*
+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+ * flag
+ */
+ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ break;
+ }
+
+ ret = drop_locks_do(trans,
+ bch2_trans_journal_res_get(trans,
+ (flags & BCH_WATERMARK_MASK)|
+ JOURNAL_RES_GET_CHECK));
+ break;
+ case -BCH_ERR_btree_insert_need_journal_reclaim:
+ bch2_trans_unlock(trans);
+
+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
+
+ wait_event_freezable(c->journal.reclaim_wait,
+ (ret = journal_reclaim_wait_done(c)));
+ if (ret < 0)
+ break;
+
+ ret = bch2_trans_relock(trans);
+ break;
+ case -BCH_ERR_btree_insert_need_flush_buffer: {
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ ret = 0;
+
+ if (wb->state.nr > wb->size * 3 / 4) {
+ bch2_trans_unlock(trans);
+ mutex_lock(&wb->flush_lock);
+
+ if (wb->state.nr > wb->size * 3 / 4) {
+ bch2_trans_begin(trans);
+ ret = __bch2_btree_write_buffer_flush(trans,
+ flags|BTREE_INSERT_NOCHECK_RW, true);
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ } else {
+ mutex_unlock(&wb->flush_lock);
+ ret = bch2_trans_relock(trans);
+ }
+ }
+ break;
+ }
+ default:
+ BUG_ON(ret >= 0);
+ break;
+ }
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
+ !(flags & BTREE_INSERT_NOWAIT) &&
+ (flags & BTREE_INSERT_NOFAIL), c,
+ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
+
+ return ret;
+}
+
+static noinline int
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ int ret;
+
+ if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
+ test_bit(BCH_FS_STARTED, &c->flags))
+ return -BCH_ERR_erofs_trans_commit;
+
+ ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
+ if (ret)
+ return ret;
+
+ bch2_write_ref_get(c, BCH_WRITE_REF_trans);
+ return 0;
+}
+
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ int ret = 0;
+
+ trans_for_each_update(trans, i) {
+ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i = NULL;
+ struct btree_write_buffered_key *wb;
+ int ret = 0;
+
+ if (!trans->nr_updates &&
+ !trans->nr_wb_updates &&
+ !trans->extra_journal_entries.nr)
+ goto out_reset;
+
+ if (flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&c->gc_lock);
+
+ ret = bch2_trans_commit_run_triggers(trans);
+ if (ret)
+ goto out_reset;
+
+ trans_for_each_update(trans, i) {
+ struct printbuf buf = PRINTBUF;
+ enum bkey_invalid_flags invalid_flags = 0;
+
+ if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+
+ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, invalid_flags, &buf)))
+ ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
+ btree_insert_entry_checks(trans, i);
+ printbuf_exit(&buf);
+
+ if (ret)
+ return ret;
+ }
+
+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ ret = do_bch2_trans_commit_to_journal_replay(trans);
+ goto out_reset;
+ }
+
+ if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+ ret = bch2_trans_commit_get_rw_cold(trans, flags);
+ if (ret)
+ goto out_reset;
+ }
+
+ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
+ mutex_trylock(&c->btree_write_buffer.flush_lock)) {
+ bch2_trans_begin(trans);
+ bch2_trans_unlock(trans);
+
+ ret = __bch2_btree_write_buffer_flush(trans,
+ flags|BTREE_INSERT_NOCHECK_RW, true);
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ goto out;
+ }
+
+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+
+ trans->journal_u64s = trans->extra_journal_entries.nr;
+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+
+ trans_for_each_update(trans, i) {
+ EBUG_ON(!i->path->should_be_locked);
+
+ ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+ if (unlikely(ret))
+ goto out;
+
+ EBUG_ON(!btree_node_intent_locked(i->path, i->level));
+
+ if (i->key_cache_already_flushed)
+ continue;
+
+ if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ continue;
+
+ /* we're going to journal the key being updated: */
+ trans->journal_u64s += jset_u64s(i->k->k.u64s);
+
+ /* and we're also going to log the overwrite: */
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += jset_u64s(i->old_k.u64s);
+ }
+
+ trans_for_each_wb_update(trans, wb)
+ trans->journal_u64s += jset_u64s(wb->k.k.u64s);
+
+ if (trans->extra_journal_res) {
+ ret = bch2_disk_reservation_add(c, trans->disk_res,
+ trans->extra_journal_res,
+ (flags & BTREE_INSERT_NOFAIL)
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto err;
+ }
+retry:
+ bch2_trans_verify_not_in_restart(trans);
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+
+ ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
+
+ /* make sure we didn't drop or screw up locks: */
+ bch2_trans_verify_locks(trans);
+
+ if (ret)
+ goto err;
+
+ trace_and_count(c, transaction_commit, trans, _RET_IP_);
+out:
+ if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+ bch2_write_ref_put(c, BCH_WRITE_REF_trans);
+out_reset:
+ if (!ret)
+ bch2_trans_downgrade(trans);
+ bch2_trans_reset_updates(trans);
+
+ return ret;
+err:
+ ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
+ if (ret)
+ goto out;
+
+ goto retry;
+}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
new file mode 100644
index 000000000000..60453ba86c4b
--- /dev/null
+++ b/fs/bcachefs/btree_types.h
@@ -0,0 +1,725 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_TYPES_H
+#define _BCACHEFS_BTREE_TYPES_H
+
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+
+#include "btree_key_cache_types.h"
+#include "buckets_types.h"
+#include "darray.h"
+#include "errcode.h"
+#include "journal_types.h"
+#include "replicas_types.h"
+#include "six.h"
+
+struct open_bucket;
+struct btree_update;
+struct btree_trans;
+
+#define MAX_BSETS 3U
+
+struct btree_nr_keys {
+
+ /*
+ * Amount of live metadata (i.e. size of node after a compaction) in
+ * units of u64s
+ */
+ u16 live_u64s;
+ u16 bset_u64s[MAX_BSETS];
+
+ /* live keys only: */
+ u16 packed_keys;
+ u16 unpacked_keys;
+};
+
+struct bset_tree {
+ /*
+ * We construct a binary tree in an array as if the array
+ * started at 1, so that things line up on the same cachelines
+ * better: see comments in bset.c at cacheline_to_bkey() for
+ * details
+ */
+
+ /* size of the binary tree and prev array */
+ u16 size;
+
+ /* function of size - precalculated for to_inorder() */
+ u16 extra;
+
+ u16 data_offset;
+ u16 aux_data_offset;
+ u16 end_offset;
+};
+
+struct btree_write {
+ struct journal_entry_pin journal;
+};
+
+struct btree_alloc {
+ struct open_buckets ob;
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+struct btree_bkey_cached_common {
+ struct six_lock lock;
+ u8 level;
+ u8 btree_id;
+ bool cached;
+};
+
+struct btree {
+ struct btree_bkey_cached_common c;
+
+ struct rhash_head hash;
+ u64 hash_val;
+
+ unsigned long flags;
+ u16 written;
+ u8 nsets;
+ u8 nr_key_bits;
+ u16 version_ondisk;
+
+ struct bkey_format format;
+
+ struct btree_node *data;
+ void *aux_data;
+
+ /*
+ * Sets of sorted keys - the real btree node - plus a binary search tree
+ *
+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+ * to the memory we have allocated for this btree node. Additionally,
+ * set[0]->data points to the entire btree node as it exists on disk.
+ */
+ struct bset_tree set[MAX_BSETS];
+
+ struct btree_nr_keys nr;
+ u16 sib_u64s[2];
+ u16 whiteout_u64s;
+ u8 byte_order;
+ u8 unpack_fn_len;
+
+ struct btree_write writes[2];
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+ /*
+ * XXX: add a delete sequence number, so when bch2_btree_node_relock()
+ * fails because the lock sequence number has changed - i.e. the
+ * contents were modified - we can still relock the node if it's still
+ * the one we want, without redoing the traversal
+ */
+
+ /*
+ * For asynchronous splits/interior node updates:
+ * When we do a split, we allocate new child nodes and update the parent
+ * node to point to them: we update the parent in memory immediately,
+ * but then we must wait until the children have been written out before
+ * the update to the parent can be written - this is a list of the
+ * btree_updates that are blocking this node from being
+ * written:
+ */
+ struct list_head write_blocked;
+
+ /*
+ * Also for asynchronous splits/interior node updates:
+ * If a btree node isn't reachable yet, we don't want to kick off
+ * another write - because that write also won't yet be reachable and
+ * marking it as completed before it's reachable would be incorrect:
+ */
+ unsigned long will_make_reachable;
+
+ struct open_buckets ob;
+
+ /* lru list */
+ struct list_head list;
+};
+
+struct btree_cache {
+ struct rhashtable table;
+ bool table_init_done;
+ /*
+ * We never free a struct btree, except on shutdown - we just put it on
+ * the btree_cache_freed list and reuse it later. This simplifies the
+ * code, and it doesn't cost us much memory as the memory usage is
+ * dominated by buffers that hold the actual btree node data and those
+ * can be freed - and the number of struct btrees allocated is
+ * effectively bounded.
+ *
+ * btree_cache_freeable effectively is a small cache - we use it because
+ * high order page allocations can be rather expensive, and it's quite
+ * common to delete and allocate btree nodes in quick succession. It
+ * should never grow past ~2-3 nodes in practice.
+ */
+ struct mutex lock;
+ struct list_head live;
+ struct list_head freeable;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
+
+ /* Number of elements in live + freeable lists */
+ unsigned used;
+ unsigned reserve;
+ atomic_t dirty;
+ struct shrinker *shrink;
+
+ /*
+ * If we need to allocate memory for a new btree node and that
+ * allocation fails, we can cannibalize another node in the btree cache
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
+ */
+ struct task_struct *alloc_lock;
+ struct closure_waitlist alloc_wait;
+};
+
+struct btree_node_iter {
+ struct btree_node_iter_set {
+ u16 k, end;
+ } data[MAX_BSETS];
+};
+
+/*
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
+ */
+static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
+static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1;
+/*
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
+ * to be doing updates:
+ */
+static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2;
+/*
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
+ */
+static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3;
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15;
+#define __BTREE_ITER_FLAGS_END 16
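+
+/*
+ * Flags are combined with bitwise or when initializing an iterator, e.g.
+ * (illustrative):
+ *
+ *	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
+ *			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ */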
+
+enum btree_path_uptodate {
+ BTREE_ITER_UPTODATE = 0,
+ BTREE_ITER_NEED_RELOCK = 1,
+ BTREE_ITER_NEED_TRAVERSE = 2,
+};
+
+#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
+#define TRACK_PATH_ALLOCATED
+#endif
+
+struct btree_path {
+ u8 idx;
+ u8 sorted_idx;
+ u8 ref;
+ u8 intent_ref;
+ u32 alloc_seq;
+ u32 downgrade_seq;
+
+ /* btree_iter_copy starts here: */
+ struct bpos pos;
+
+ enum btree_id btree_id:5;
+ bool cached:1;
+ bool preserve:1;
+ enum btree_path_uptodate uptodate:2;
+ /*
+ * When true, failing to relock this path will cause the transaction to
+ * restart:
+ */
+ bool should_be_locked:1;
+ unsigned level:3,
+ locks_want:3;
+ u8 nodes_locked;
+
+ struct btree_path_level {
+ struct btree *b;
+ struct btree_node_iter iter;
+ u32 lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ u64 lock_taken_time;
+#endif
+ } l[BTREE_MAX_DEPTH];
+#ifdef TRACK_PATH_ALLOCATED
+ unsigned long ip_allocated;
+#endif
+};
+
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+ return path->l + path->level;
+}
+
+static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
+{
+#ifdef TRACK_PATH_ALLOCATED
+ return path->ip_allocated;
+#else
+ return _THIS_IP_;
+#endif
+}
+
+/*
+ * @pos - iterator's current position
+ *
+ * Note: @level, @locks_want and @nodes_locked (the per-node lock state) live
+ * on struct btree_path, which the iterator points to, not on the iterator
+ * itself.
+ */
+struct btree_iter {
+ struct btree_trans *trans;
+ struct btree_path *path;
+ struct btree_path *update_path;
+ struct btree_path *key_cache_path;
+
+ enum btree_id btree_id:8;
+ unsigned min_depth:3;
+ unsigned advanced:1;
+
+ /* btree_iter_copy starts here: */
+ u16 flags;
+
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
+
+ struct bpos pos;
+ /*
+ * Current unpacked key - so that bch2_btree_iter_next()/
+ * bch2_btree_iter_next_slot() can correctly advance pos.
+ */
+ struct bkey k;
+
+ /* BTREE_ITER_WITH_JOURNAL: */
+ size_t journal_idx;
+ struct bpos journal_pos;
+#ifdef TRACK_PATH_ALLOCATED
+ unsigned long ip_allocated;
+#endif
+};
+
+#define BKEY_CACHED_ACCESSED 0
+#define BKEY_CACHED_DIRTY 1
+
+struct bkey_cached {
+ struct btree_bkey_cached_common c;
+
+ unsigned long flags;
+ u16 u64s;
+ bool valid;
+ u32 btree_trans_barrier_seq;
+ struct bkey_cached_key key;
+
+ struct rhash_head hash;
+ struct list_head list;
+
+ struct journal_entry_pin journal;
+ u64 seq;
+
+ struct bkey_i *k;
+};
+
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+ return !b->cached
+ ? container_of(b, struct btree, c)->key.k.p
+ : container_of(b, struct bkey_cached, c)->key.pos;
+}
+
+struct btree_insert_entry {
+ unsigned flags;
+ u8 bkey_type;
+ enum btree_id btree_id:8;
+ u8 level:4;
+ bool cached:1;
+ bool insert_trigger_run:1;
+ bool overwrite_trigger_run:1;
+ bool key_cache_already_flushed:1;
+ /*
+ * @old_k may be a key from the journal; @old_btree_u64s always refers
+ * to the size of the key being overwritten in the btree:
+ */
+ u8 old_btree_u64s;
+ struct bkey_i *k;
+ struct btree_path *path;
+ u64 seq;
+ /* key being overwritten: */
+ struct bkey old_k;
+ const struct bch_val *old_v;
+ unsigned long ip_allocated;
+};
+
+#define BTREE_ITER_MAX 64
+
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+ btree_trans_commit_hook_fn *fn;
+ struct btree_trans_commit_hook *next;
+};
+
+#define BTREE_TRANS_MEM_MAX (1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
+
+struct btree_trans {
+ struct bch_fs *c;
+ const char *fn;
+ struct closure ref;
+ struct list_head list;
+ u64 last_begin_time;
+
+ u8 lock_may_not_fail;
+ u8 lock_must_abort;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+
+ int srcu_idx;
+
+ u8 fn_idx;
+ u8 nr_sorted;
+ u8 nr_updates;
+ u8 nr_wb_updates;
+ u8 wb_updates_size;
+ bool srcu_held:1;
+ bool used_mempool:1;
+ bool in_traverse_all:1;
+ bool paths_sorted:1;
+ bool memory_allocation_failure:1;
+ bool journal_transaction_names:1;
+ bool journal_replay_not_finished:1;
+ bool notrace_relock_fail:1;
+ bool write_locked:1;
+ enum bch_errcode restarted:16;
+ u32 restart_count;
+ unsigned long last_begin_ip;
+ unsigned long last_restarted_ip;
+ unsigned long srcu_lock_time;
+
+ /*
+ * For when bch2_trans_update notices we'll be splitting a compressed
+ * extent:
+ */
+ unsigned extra_journal_res;
+ unsigned nr_max_paths;
+
+ u64 paths_allocated;
+
+ unsigned mem_top;
+ unsigned mem_max;
+ unsigned mem_bytes;
+ void *mem;
+
+ u8 sorted[BTREE_ITER_MAX + 8];
+ struct btree_path paths[BTREE_ITER_MAX];
+ struct btree_insert_entry updates[BTREE_ITER_MAX];
+ struct btree_write_buffered_key *wb_updates;
+
+ /* update path: */
+ struct btree_trans_commit_hook *hooks;
+ darray_u64 extra_journal_entries;
+ struct journal_entry_pin *journal_pin;
+
+ struct journal_res journal_res;
+ u64 *journal_seq;
+ struct disk_reservation *disk_res;
+ unsigned journal_u64s;
+ struct replicas_delta_list *fs_usage_deltas;
+};
+
+#define BCH_BTREE_WRITE_TYPES() \
+ x(initial, 0) \
+ x(init_next_bset, 1) \
+ x(cache_reclaim, 2) \
+ x(journal_reclaim, 3) \
+ x(interior, 4)
+
+enum btree_write_type {
+#define x(t, n) BTREE_WRITE_##t,
+ BCH_BTREE_WRITE_TYPES()
+#undef x
+ BTREE_WRITE_TYPE_NR,
+};
+
+#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
+#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
+
+#define BTREE_FLAGS() \
+ x(read_in_flight) \
+ x(read_error) \
+ x(dirty) \
+ x(need_write) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(noevict) \
+ x(write_idx) \
+ x(accessed) \
+ x(write_in_flight) \
+ x(write_in_flight_inner) \
+ x(just_written) \
+ x(dying) \
+ x(fake) \
+ x(need_rewrite) \
+ x(never_write)
+
+enum btree_flags {
+ /* First bits for btree node write type */
+ BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
+#define x(flag) BTREE_NODE_##flag,
+ BTREE_FLAGS()
+#undef x
+};
+
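+/*
+ * Generates btree_node_<flag>(), set_btree_node_<flag>() and
+ * clear_btree_node_<flag>() helpers for each flag above:
+ */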
+#define x(flag) \
+static inline bool btree_node_ ## flag(struct btree *b) \
+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void set_btree_node_ ## flag(struct btree *b) \
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void clear_btree_node_ ## flag(struct btree *b) \
+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
+
+BTREE_FLAGS()
+#undef x
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+ return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+ return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline struct bset_tree *bset_tree_last(struct btree *b)
+{
+ EBUG_ON(!b->nsets);
+ return b->set + b->nsets - 1;
+}
+
+static inline void *
+__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
+{
+ return (void *) ((u64 *) b->data + 1 + offset);
+}
+
+static inline u16
+__btree_node_ptr_to_offset(const struct btree *b, const void *p)
+{
+ u16 ret = (u64 *) p - 1 - (u64 *) b->data;
+
+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
+ return ret;
+}
+
+static inline struct bset *bset(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return __btree_node_offset_to_ptr(b, t->data_offset);
+}
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+ t->end_offset =
+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+ const struct bset *i)
+{
+ t->data_offset = __btree_node_ptr_to_offset(b, i);
+ set_btree_bset_end(b, t);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+ return bset(b, b->set);
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+ return bset(b, bset_tree_last(b));
+}
+
+static inline u16
+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
+{
+ return __btree_node_ptr_to_offset(b, k);
+}
+
+static inline struct bkey_packed *
+__btree_node_offset_to_key(const struct btree *b, u16 k)
+{
+ return __btree_node_offset_to_ptr(b, k);
+}
+
+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
+{
+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
+}
+
+#define btree_bkey_first(_b, _t) \
+({ \
+ EBUG_ON(bset(_b, _t)->start != \
+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
+ \
+ bset(_b, _t)->start; \
+})
+
+#define btree_bkey_last(_b, _t) \
+({ \
+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
+ vstruct_last(bset(_b, _t))); \
+ \
+ __btree_node_offset_to_key(_b, (_t)->end_offset); \
+})
+
+static inline unsigned bset_u64s(struct bset_tree *t)
+{
+ return t->end_offset - t->data_offset -
+ sizeof(struct bset) / sizeof(u64);
+}
+
+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
+{
+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
+}
+
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
+{
+ return i - (void *) b->data;
+}
+
+enum btree_node_type {
+ BKEY_TYPE_btree,
+#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
+ BCH_BTREE_IDS()
+#undef x
+ BKEY_TYPE_NR
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
+{
+ return level ? BKEY_TYPE_btree : (unsigned) id + 1;
+}
+
+/* Type of keys @b contains: */
+static inline enum btree_node_type btree_node_type(struct btree *b)
+{
+ return __btree_node_type(b->c.level, b->c.btree_id);
+}
+
+const char *bch2_btree_node_type_str(enum btree_node_type);
+
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
+ (BIT_ULL(BKEY_TYPE_extents)| \
+ BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_btree))
+
+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+ (BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_snapshots))
+
+#define BTREE_NODE_TYPE_HAS_TRIGGERS \
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
+ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+
+static inline bool btree_node_type_needs_gc(enum btree_node_type type)
+{
+ return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
+}
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
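+	/* Node types are btree ids shifted by one (BKEY_TYPE_btree is 0), hence the + 1: */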
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << type) & mask;
+}
+
+static inline bool btree_id_is_extents(enum btree_id btree)
+{
+ return btree_node_type_is_extents(__btree_node_type(0, btree));
+}
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
+static inline bool btree_type_has_ptrs(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
+struct btree_root {
+ struct btree *b;
+
+ /* On disk root - see async splits: */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ u8 level;
+ u8 alive;
+ s8 error;
+};
+
+enum btree_gc_coalesce_fail_reason {
+ BTREE_GC_COALESCE_FAIL_RESERVE_GET,
+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
+};
+
+enum btree_node_sibling {
+ btree_prev_sib,
+ btree_next_sib,
+};
+
+#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
new file mode 100644
index 000000000000..2fd3c8cc6f51
--- /dev/null
+++ b/fs/bcachefs/btree_update.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "extents.h"
+#include "keylist.h"
+#include "snapshot.h"
+#include "trace.h"
+
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+ const struct btree_insert_entry *r)
+{
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->cached, r->cached) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->k->k.p, r->k->k.p);
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, enum btree_update_flags,
+ unsigned long ip);
+
+static noinline int extent_front_merge(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bkey_i **insert,
+ enum btree_update_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *update;
+ int ret;
+
+ update = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ return ret;
+
+ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
+ return 0;
+
+ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
+ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ ret = bch2_btree_delete_at(trans, iter, flags);
+ if (ret)
+ return ret;
+
+ *insert = update;
+ return 0;
+}
+
+static noinline int extent_back_merge(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ int ret;
+
+ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
+ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+ return 0;
+}
+
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot = pos.snapshot;
+ int ret;
+
+ if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+ return 0;
+
+ pos.snapshot++;
+
+ for_each_btree_key_norestart(trans, iter, btree_id, pos,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_NOPRESERVE, k, ret) {
+ if (!bkey_eq(k.k->p, pos))
+ break;
+
+ if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+ k.k->p.snapshot)) {
+ ret = !bkey_whiteout(k.k);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter old_iter, new_iter = { NULL };
+ struct bkey_s_c old_k, new_k;
+ snapshot_id_list s;
+ struct bkey_i *update;
+ int ret = 0;
+
+ if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+ return 0;
+
+ darray_init(&s);
+
+ bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
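+
+	/*
+	 * For every descendent snapshot with its own key at @old_pos, emit a
+	 * whiteout at @new_pos in that snapshot, unless a key already exists
+	 * there:
+	 */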
+ while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+ !(ret = bkey_err(old_k)) &&
+ bkey_eq(old_pos, old_k.k->p)) {
+ struct bpos whiteout_pos =
+			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+
+ if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+ snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+ continue;
+
+ new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bkey_err(new_k);
+ if (ret)
+ break;
+
+ if (new_k.k->type == KEY_TYPE_deleted) {
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ break;
+
+ bkey_init(&update->k);
+ update->k.p = whiteout_pos;
+ update->k.type = KEY_TYPE_whiteout;
+
+ ret = bch2_trans_update(trans, &new_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ }
+ bch2_trans_iter_exit(trans, &new_iter);
+
+ ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &new_iter);
+ bch2_trans_iter_exit(trans, &old_iter);
+ darray_exit(&s);
+
+ return ret;
+}
+
+int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_update_flags flags,
+ struct bkey_s_c old,
+ struct bkey_s_c new)
+{
+ enum btree_id btree_id = iter->btree_id;
+ struct bkey_i *update;
+ struct bpos new_start = bkey_start_pos(new.k);
+ unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start);
+ unsigned back_split = bkey_gt(old.k->p, new.k->p);
+ unsigned middle_split = (front_split || back_split) &&
+ old.k->p.snapshot != new.k->p.snapshot;
+ unsigned nr_splits = front_split + back_split + middle_split;
+ int ret = 0, compressed_sectors;
+
+ /*
+ * If we're going to be splitting a compressed extent, note it
+ * so that __bch2_trans_commit() can increase our disk
+ * reservation:
+ */
+ if (nr_splits > 1 &&
+ (compressed_sectors = bch2_bkey_sectors_compressed(old)))
+ trans->extra_journal_res += compressed_sectors * (nr_splits - 1);
+
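+	/* Existing extent starts before the new one - keep the front portion: */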
+ if (front_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_back(new_start, update);
+
+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
+ old.k->p, update->k.p) ?:
+ bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ if (ret)
+ return ret;
+ }
+
+ /* If we're overwriting in a different snapshot - middle split: */
+ if (middle_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_front(new_start, update);
+ bch2_cut_back(new.k->p, update);
+
+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
+ old.k->p, update->k.p) ?:
+ bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ if (ret)
+ return ret;
+ }
+
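+	/*
+	 * Delete the existing extent as seen from the new key's snapshot
+	 * (with a whiteout, if needed, so it stays deleted in this snapshot):
+	 */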
+ if (bkey_le(old.k->p, new.k->p)) {
+ update = bch2_trans_kmalloc(trans, sizeof(*update));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bkey_init(&update->k);
+ update->k.p = old.k->p;
+ update->k.p.snapshot = new.k->p.snapshot;
+
+ if (new.k->p.snapshot != old.k->p.snapshot) {
+ update->k.type = KEY_TYPE_whiteout;
+ } else if (btree_type_has_snapshots(btree_id)) {
+ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ update->k.type = KEY_TYPE_whiteout;
+ }
+
+ ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ if (ret)
+ return ret;
+ }
+
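+	/* Existing extent extends past the end of the new one - keep the back portion: */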
+ if (back_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_front(new.k->p, update);
+
+ ret = bch2_trans_update_by_path(trans, iter->path, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags, _RET_IP_);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch2_trans_update_extent(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert,
+ enum btree_update_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ enum btree_id btree_id = orig_iter->btree_id;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_NOT_EXTENTS);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+
+ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+ ret = extent_front_merge(trans, &iter, k, &insert, flags);
+ if (ret)
+ goto err;
+ }
+
+ goto next;
+ }
+
+ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
+ bool done = bkey_lt(insert->k.p, k.k->p);
+
+ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+ if (ret)
+ goto err;
+
+ if (done)
+ goto out;
+next:
+ bch2_btree_iter_advance(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+ }
+
+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+ ret = extent_back_merge(trans, &iter, insert, k);
+ if (ret)
+ goto err;
+ }
+out:
+ if (!bkey_deleted(&insert->k))
+ ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_insert_entry *i,
+ enum btree_update_flags flags,
+ unsigned long ip)
+{
+ struct btree_path *btree_path;
+ struct bkey k;
+ int ret;
+
+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, btree_path, 0);
+ if (ret)
+ goto out;
+
+ /*
+ * The old key in the insert entry might actually refer to an existing
+ * key in the btree that has been deleted from cache and not yet
+ * flushed. Check for this and skip the flush so we don't run triggers
+ * against a stale key.
+ */
+ bch2_btree_path_peek_slot_exact(btree_path, &k);
+ if (!bkey_deleted(&k))
+ goto out;
+
+ i->key_cache_already_flushed = true;
+ i->flags |= BTREE_TRIGGER_NORUN;
+
+ btree_path_set_should_be_locked(btree_path);
+ ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
+out:
+ bch2_path_put(trans, btree_path, true);
+ return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags,
+ unsigned long ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i, n;
+ u64 seq = 0;
+ int cmp;
+
+ EBUG_ON(!path->should_be_locked);
+ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ EBUG_ON(!bpos_eq(k->k.p, path->pos));
+
+ /*
+ * The transaction journal res hasn't been allocated at this point.
+ * That occurs at commit time. Reuse the seq field to pass in the seq
+ * of a prejournaled key.
+ */
+ if (flags & BTREE_UPDATE_PREJOURNAL)
+ seq = trans->journal_res.seq;
+
+ n = (struct btree_insert_entry) {
+ .flags = flags,
+ .bkey_type = __btree_node_type(path->level, path->btree_id),
+ .btree_id = path->btree_id,
+ .level = path->level,
+ .cached = path->cached,
+ .path = path,
+ .k = k,
+ .seq = seq,
+ .ip_allocated = ip,
+ };
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(i != trans->updates &&
+ btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
+
+ /*
+ * Pending updates are kept sorted: first, find position of new update,
+ * then delete/trim any updates the new update overwrites:
+ */
+ trans_for_each_update(trans, i) {
+ cmp = btree_insert_entry_cmp(&n, i);
+ if (cmp <= 0)
+ break;
+ }
+
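+	/*
+	 * If we already have an update for this key, replace it in place;
+	 * otherwise insert a new entry, keeping the list sorted:
+	 */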
+ if (!cmp && i < trans->updates + trans->nr_updates) {
+ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
+
+ bch2_path_put(trans, i->path, true);
+ i->flags = n.flags;
+ i->cached = n.cached;
+ i->k = n.k;
+ i->path = n.path;
+ i->seq = n.seq;
+ i->ip_allocated = n.ip_allocated;
+ } else {
+ array_insert_item(trans->updates, trans->nr_updates,
+ i - trans->updates, n);
+
+ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+
+ __btree_path_get(i->path, true);
+
+ /*
+ * If a key is present in the key cache, it must also exist in the
+ * btree - this is necessary for cache coherency. When iterating over
+ * a btree that's cached in the key cache, the btree iter code checks
+ * the key cache - but the key has to exist in the btree for that to
+ * work:
+ */
+ if (path->cached && bkey_deleted(&i->old_k))
+ return flush_new_cached_update(trans, path, i, flags, ip);
+
+ return 0;
+}
+
+static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_path *path)
+{
+ if (!iter->key_cache_path ||
+ !iter->key_cache_path->should_be_locked ||
+ !bpos_eq(iter->key_cache_path->pos, iter->pos)) {
+ struct bkey_cached *ck;
+ int ret;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path =
+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_CACHED, _THIS_IP_);
+
+ iter->key_cache_path =
+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ BTREE_ITER_CACHED);
+ if (unlikely(ret))
+ return ret;
+
+ ck = (void *) iter->key_cache_path->l[0].b;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+ }
+
+ btree_path_set_should_be_locked(iter->key_cache_path);
+ }
+
+ return 0;
+}
+
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ struct btree_path *path = iter->update_path ?: iter->path;
+ int ret;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ return bch2_trans_update_extent(trans, iter, k, flags);
+
+ if (bkey_deleted(&k->k) &&
+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (ret)
+ k->k.type = KEY_TYPE_whiteout;
+ }
+
+ /*
+ * Ensure that updates to cached btrees go to the key cache:
+ */
+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ !path->cached &&
+ !path->level &&
+ btree_id_cached(trans->c, path->btree_id)) {
+ ret = bch2_trans_update_get_key_cache(trans, iter, path);
+ if (ret)
+ return ret;
+
+ path = iter->key_cache_path;
+ }
+
+ return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
+}
+
+/*
+ * Add a transaction update for a key that has already been journaled.
+ */
+int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
+ struct btree_iter *iter, struct bkey_i *k,
+ enum btree_update_flags flags)
+{
+ trans->journal_res.seq = seq;
+ return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
+ BTREE_UPDATE_PREJOURNAL);
+}
+
+static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_copy(n, k);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ struct btree_write_buffered_key *i;
+ int ret;
+
+ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ if (unlikely(trans->journal_replay_not_finished))
+ return bch2_btree_insert_clone_trans(trans, btree, k);
+
+ trans_for_each_wb_update(trans, i) {
+ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
+ bkey_copy(&i->k, k);
+ return 0;
+ }
+ }
+
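+	/* Allocate the write buffer update array, doubling its size when full: */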
+ if (!trans->wb_updates ||
+ trans->nr_wb_updates == trans->wb_updates_size) {
+ struct btree_write_buffered_key *u;
+
+ if (trans->nr_wb_updates == trans->wb_updates_size) {
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ BUG_ON(trans->wb_updates_size > U8_MAX / 2);
+ trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
+ if (s)
+ s->wb_updates_size = trans->wb_updates_size;
+ }
+
+ u = bch2_trans_kmalloc_nomemzero(trans,
+ trans->wb_updates_size *
+ sizeof(struct btree_write_buffered_key));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ if (trans->nr_wb_updates)
+ memcpy(u, trans->wb_updates, trans->nr_wb_updates *
+ sizeof(struct btree_write_buffered_key));
+ trans->wb_updates = u;
+ }
+
+ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
+ .btree = btree,
+ };
+
+ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
+ trans->nr_wb_updates++;
+
+ return 0;
+}
+
+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_id btree, struct bpos end)
+{
+ struct bkey_s_c k;
+ int ret = 0;
+
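+	/*
+	 * Find the last key in @btree, then advance to the empty slot
+	 * immediately after it:
+	 */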
+ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_prev(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_advance(iter);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ BUG_ON(k.k->type != KEY_TYPE_deleted);
+
+ if (bkey_gt(k.k->p, end)) {
+ ret = -BCH_ERR_ENOSPC_btree_slot;
+ goto err;
+ }
+
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+void bch2_trans_commit_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ h->next = trans->hooks;
+ trans->hooks = h;
+}
+
+int bch2_btree_insert_nonextent(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_i *k,
+ enum btree_update_flags flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/**
+ * bch2_btree_insert - insert a key into the given btree
+ * @c: pointer to struct bch_fs
+ * @id: btree to insert into
+ * @k: key to insert
+ * @disk_res: must be non-NULL whenever inserting or potentially
+ * splitting data extents
+ * @flags: transaction commit flags
+ *
+ * Returns: 0 on success, error code on failure
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
+ struct disk_reservation *disk_res, int flags)
+{
+ return bch2_trans_do(c, disk_res, NULL, flags,
+ bch2_btree_insert_trans(trans, id, k, 0));
+}
+
+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
+ unsigned len, unsigned update_flags)
+{
+ struct bkey_i *k;
+
+ k = bch2_trans_kmalloc(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ bkey_init(&k->k);
+ k->k.p = iter->pos;
+ bch2_key_resize(&k->k, len);
+ return bch2_trans_update(trans, iter, k, update_flags);
+}
+
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned update_flags)
+{
+ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
+}
+
+int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ struct bkey_i *k;
+
+ k = bch2_trans_kmalloc(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ bkey_init(&k->k);
+ k->k.p = pos;
+ return bch2_trans_update_buffered(trans, btree, k);
+}
+
+int bch2_btree_delete(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ unsigned update_flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, pos,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, update_flags);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+ struct bpos start, struct bpos end,
+ unsigned update_flags,
+ u64 *journal_seq)
+{
+ u32 restart_count = trans->restart_count;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+ while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(trans->c, 0);
+ struct bkey_i delete;
+
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete.k);
+
+ /*
+ * This could probably be more efficient for extents:
+ */
+
+ /*
+ * For extents, iter.pos won't necessarily be the same as
+ * bkey_start_pos(k.k) (for non extents they always will be the
+ * same). It's important that we delete starting from iter.pos
+ * because the range we want to delete could start in the middle
+ * of k.
+ *
+ * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+ * bkey_start_pos(k.k)).
+ */
+ delete.k.p = iter.pos;
+
+ if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ bch2_key_resize(&delete.k,
+ bpos_min(end, k.k->p).offset -
+ iter.pos.offset);
+
+ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
+ bch2_trans_commit(trans, &disk_res, journal_seq,
+ BTREE_INSERT_NOFAIL);
+ bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+ /*
+ * the bch2_trans_begin() call is in a weird place because we
+ * need to call it after every transaction commit, to avoid path
+ * overflow, but don't want to call it if the delete operation
+ * is a no-op and we have no work to do:
+ */
+ bch2_trans_begin(trans);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+ struct bpos start, struct bpos end,
+ unsigned update_flags,
+ u64 *journal_seq)
+{
+ int ret = bch2_trans_run(c,
+ bch2_btree_delete_range_trans(trans, id, start, end,
+ update_flags, journal_seq));
+ if (ret == -BCH_ERR_transaction_restart_nested)
+ ret = 0;
+ return ret;
+}
+
+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
+ struct bkey_i *k;
+ int ret = 0;
+
+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k->k.p = pos;
+
+ return bch2_trans_update_buffered(trans, btree, k);
+}
+
+__printf(2, 0)
+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+{
+ struct printbuf buf = PRINTBUF;
+ struct jset_entry_log *l;
+ unsigned u64s;
+ int ret;
+
+ prt_vprintf(&buf, fmt, args);
+ ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ goto err;
+
+ u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+
+ ret = darray_make_room(entries, jset_u64s(u64s));
+ if (ret)
+ goto err;
+
+ l = (void *) &darray_top(*entries);
+ l->entry.u64s = cpu_to_le16(u64s);
+ l->entry.btree_id = 0;
+ l->entry.level = 1;
+ l->entry.type = BCH_JSET_ENTRY_log;
+ l->entry.pad[0] = 0;
+ l->entry.pad[1] = 0;
+ l->entry.pad[2] = 0;
+ memcpy(l->d, buf.buf, buf.pos);
+ while (buf.pos & 7)
+ l->d[buf.pos++] = '\0';
+
+ entries->nr += jset_u64s(u64s);
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+__printf(3, 0)
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+ va_list args)
+{
+ int ret;
+
+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+ ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+ } else {
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|commit_flags,
+ __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
+ }
+
+ return ret;
+}
+
+__printf(2, 3)
+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = __bch2_fs_log_msg(c, 0, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+/*
+ * Use for logging messages during recovery to enable reserved space and avoid
+ * blocking.
+ */
+__printf(2, 3)
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
+ va_end(args);
+ return ret;
+}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
new file mode 100644
index 000000000000..9816d2286540
--- /dev/null
+++ b/fs/bcachefs/btree_update.h
@@ -0,0 +1,340 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_H
+#define _BCACHEFS_BTREE_UPDATE_H
+
+#include "btree_iter.h"
+#include "journal.h"
+
+struct bch_fs;
+struct btree;
+
+void bch2_btree_node_prep_for_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_i *);
+
+int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
+int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
+
+void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, u64);
+
+enum btree_insert_flags {
+ /* First bits for bch_watermark: */
+ __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
+ __BTREE_INSERT_NOCHECK_RW,
+ __BTREE_INSERT_LAZY_RW,
+ __BTREE_INSERT_JOURNAL_REPLAY,
+ __BTREE_INSERT_JOURNAL_RECLAIM,
+ __BTREE_INSERT_NOWAIT,
+ __BTREE_INSERT_GC_LOCK_HELD,
+ __BCH_HASH_SET_MUST_CREATE,
+ __BCH_HASH_SET_MUST_REPLACE,
+};
+
+/* Don't check for -ENOSPC: */
+#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL)
+
+#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW)
+#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW)
+
+/* Insert is for journal replay - don't get journal reservations: */
+#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
+
+/* Insert is being called from journal reclaim path: */
+#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
+
+/* Don't block on allocation failure (for new btree nodes): */
+#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
+#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
+
+#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
+
+int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
+ unsigned, unsigned);
+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
+int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
+int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
+
+int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
+ struct bkey_i *, enum btree_update_flags);
+
+int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
+ enum btree_update_flags);
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
+ struct disk_reservation *, int flags);
+
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos, unsigned, u64 *);
+int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
+ struct bpos, struct bpos, unsigned, u64 *);
+
+int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos);
+
+/*
+ * For use when splitting extents in existing snapshots:
+ *
+ * If @old_pos is an interior snapshot node, iterate over descendent snapshot
+ * nodes: for every descendent snapshot in which @old_pos is overwritten and
+ * not visible, emit a whiteout at @new_pos.
+ */
+static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ if (!btree_type_has_snapshots(btree) ||
+ bkey_eq(old_pos, new_pos))
+ return 0;
+
+ return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
+}
+
+int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
+ enum btree_update_flags,
+ struct bkey_s_c, struct bkey_s_c);
+
+int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos);
+
+int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_update_flags);
+int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *,
+ struct bkey_i *, enum btree_update_flags);
+int __must_check bch2_trans_update_buffered(struct btree_trans *,
+ enum btree_id, struct bkey_i *);
+
+void bch2_trans_commit_hook(struct btree_trans *,
+ struct btree_trans_commit_hook *);
+int __bch2_trans_commit(struct btree_trans *, unsigned);
+
+__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
+
+/**
+ * bch2_trans_commit - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+static inline int bch2_trans_commit(struct btree_trans *trans,
+ struct disk_reservation *disk_res,
+ u64 *journal_seq,
+ unsigned flags)
+{
+ trans->disk_res = disk_res;
+ trans->journal_seq = journal_seq;
+
+ return __bch2_trans_commit(trans, flags);
+}
+
+#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
+
+#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
+
+#define bch2_trans_run(_c, _do) \
+({ \
+ struct btree_trans *trans = bch2_trans_get(_c); \
+ int _ret = (_do); \
+ bch2_trans_put(trans); \
+ _ret; \
+})
+
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
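+
+/*
+ * Illustrative usage - this mirrors how bch2_btree_insert() is implemented:
+ *
+ *	bch2_trans_do(c, disk_res, NULL, flags,
+ *		      bch2_btree_insert_trans(trans, id, k, 0));
+ */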
+
+#define trans_for_each_update(_trans, _i) \
+ for ((_i) = (_trans)->updates; \
+ (_i) < (_trans)->updates + (_trans)->nr_updates; \
+ (_i)++)
+
+#define trans_for_each_wb_update(_trans, _i) \
+ for ((_i) = (_trans)->wb_updates; \
+ (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \
+ (_i)++)
+
+static inline void bch2_trans_reset_updates(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ bch2_path_put(trans, i->path, true);
+
+ trans->extra_journal_res = 0;
+ trans->nr_updates = 0;
+ trans->nr_wb_updates = 0;
+ trans->wb_updates = NULL;
+ trans->hooks = NULL;
+ trans->extra_journal_entries.nr = 0;
+
+ if (trans->fs_usage_deltas) {
+ trans->fs_usage_deltas->used = 0;
+ memset((void *) trans->fs_usage_deltas +
+ offsetof(struct replicas_delta_list, memset_start), 0,
+ (void *) &trans->fs_usage_deltas->memset_end -
+ (void *) &trans->fs_usage_deltas->memset_start);
+ }
+}
+
+static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
+ unsigned type, unsigned min_bytes)
+{
+ unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
+ struct bkey_i *mut;
+
+ if (type && k.k->type != type)
+ return ERR_PTR(-ENOENT);
+
+ mut = bch2_trans_kmalloc_nomemzero(trans, bytes);
+ if (!IS_ERR(mut)) {
+ bkey_reassemble(mut, k);
+
+ if (unlikely(bytes > bkey_bytes(k.k))) {
+ memset((void *) mut + bkey_bytes(k.k), 0,
+ bytes - bkey_bytes(k.k));
+ mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
+ }
+ }
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
+{
+ return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
+}
+
+#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \
+ bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+
+static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k, unsigned flags,
+ unsigned type, unsigned min_bytes)
+{
+ struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
+ int ret;
+
+ if (IS_ERR(mut))
+ return mut;
+
+ ret = bch2_trans_update(trans, iter, mut, flags);
+ if (ret)
+ return ERR_PTR(ret);
+
+ *k = bkey_i_to_s_c(mut);
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k, unsigned flags)
+{
+ return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
+}
+
+#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \
+ bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+
+static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type, unsigned min_bytes)
+{
+ struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
+ btree_id, pos, flags|BTREE_ITER_INTENT, type);
+ struct bkey_i *ret = IS_ERR(k.k)
+ ? ERR_CAST(k.k)
+ : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
+ if (IS_ERR(ret))
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
+}
+
+static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type, unsigned min_bytes)
+{
+ struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
+ btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes);
+ int ret;
+
+ if (IS_ERR(mut))
+ return mut;
+
+ ret = bch2_trans_update(trans, iter, mut, flags);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+ }
+
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned min_bytes)
+{
+ return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
+}
+
+#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
+ bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \
+ _btree_id, _pos, _flags, \
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
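+
+/*
+ * Illustrative (hypothetical) use of the typed variant - get a mutable copy
+ * of an alloc_v4 key and add it as a transaction update:
+ *
+ *	struct bkey_i_alloc_v4 *a =
+ *		bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_alloc, pos,
+ *					0, alloc_v4);
+ *	ret = PTR_ERR_OR_ZERO(a);
+ */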
+
+static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
+ unsigned flags, unsigned type, unsigned val_size)
+{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
+ int ret;
+
+ if (IS_ERR(k))
+ return k;
+
+ bkey_init(&k->k);
+ k->k.p = iter->pos;
+ k->k.type = type;
+ set_bkey_val_bytes(&k->k, val_size);
+
+ ret = bch2_trans_update(trans, iter, k, flags);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+ return k;
+}
+
+#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \
+ bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \
+ KEY_TYPE_##_type, sizeof(struct bch_##_type)))
+
+#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
new file mode 100644
index 000000000000..239fcc3c7c99
--- /dev/null
+++ b/fs/bcachefs/btree_update_interior.c
@@ -0,0 +1,2476 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/random.h>
+
+static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
+ struct btree_path *, struct btree *,
+ struct keylist *, unsigned);
+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
+
+static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
+{
+ struct btree_path *path;
+
+ path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ BTREE_ITER_NOPRESERVE|
+ BTREE_ITER_INTENT, _RET_IP_);
+ path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+ bch2_btree_path_downgrade(trans, path);
+ __bch2_btree_path_unlock(trans, path);
+ return path;
+}
+
+/* Debug code: */
+
+/*
+ * Verify that child nodes correctly span parent node's range:
+ */
+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bpos next_node = b->data->min_key;
+ struct btree_node_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_btree_ptr_v2 bp;
+ struct bkey unpacked;
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ BUG_ON(!b->c.level);
+
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ return;
+
+ bch2_btree_node_iter_init_from_start(&iter, b);
+
+ while (1) {
+ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
+ break;
+ bp = bkey_s_c_to_btree_ptr_v2(k);
+
+ if (!bpos_eq(next_node, bp.v->min_key)) {
+ bch2_dump_btree_node(c, b);
+ bch2_bpos_to_text(&buf1, next_node);
+ bch2_bpos_to_text(&buf2, bp.v->min_key);
+ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
+ }
+
+ bch2_btree_node_iter_advance(&iter, b);
+
+ if (bch2_btree_node_iter_end(&iter)) {
+ if (!bpos_eq(k.k->p, b->key.k.p)) {
+ bch2_dump_btree_node(c, b);
+ bch2_bpos_to_text(&buf1, b->key.k.p);
+ bch2_bpos_to_text(&buf2, k.k->p);
+ panic("expected end %s got %s\n", buf1.buf, buf2.buf);
+ }
+ break;
+ }
+
+ next_node = bpos_successor(k.k->p);
+ }
+#endif
+}
+
+/* Calculate ideal packed bkey format for new btree nodes: */
+
+static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
+{
+ struct bkey_packed *k;
+ struct bset_tree *t;
+ struct bkey uk;
+
+ for_each_bset(b, t)
+ bset_tree_for_each_key(b, t, k)
+ if (!bkey_deleted(k)) {
+ uk = bkey_unpack_key(b, k);
+ bch2_bkey_format_add_key(s, &uk);
+ }
+}
+
+static struct bkey_format bch2_btree_calc_format(struct btree *b)
+{
+ struct bkey_format_state s;
+
+ bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, b->data->min_key);
+ bch2_bkey_format_add_pos(&s, b->data->max_key);
+ __bch2_btree_calc_format(&s, b);
+
+ return bch2_bkey_format_done(&s);
+}
+
+static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
+ struct bkey_format *old_f,
+ struct bkey_format *new_f)
+{
+ /* stupid integer promotion rules */
+ ssize_t delta =
+ (((int) new_f->key_u64s - old_f->key_u64s) *
+ (int) nr.packed_keys) +
+ (((int) new_f->key_u64s - BKEY_U64s) *
+ (int) nr.unpacked_keys);
+
+ BUG_ON(delta + nr.live_u64s < 0);
+
+ return nr.live_u64s + delta;
+}
+
+/**
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
+ *
+ * @c: filesystem handle
+ * @b: btree node to rewrite
+ * @nr: number of keys for new node (i.e. b->nr)
+ * @new_f: bkey format to translate keys to
+ *
+ * Returns: true if all re-packed keys will be able to fit in a new node.
+ *
+ * Assumes all keys will successfully pack with the new format.
+ */
+static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
+ struct btree_nr_keys nr,
+ struct bkey_format *new_f)
+{
+ size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
+
+ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+}
+
+/* Btree node freeing/allocation: */
+
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
+{
+ trace_and_count(c, btree_node_free, c, b);
+
+ BUG_ON(btree_node_write_blocked(b));
+ BUG_ON(btree_node_dirty(b));
+ BUG_ON(btree_node_need_write(b));
+ BUG_ON(b == btree_node_root(c, b));
+ BUG_ON(b->ob.nr);
+ BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(b->will_make_reachable);
+
+ clear_btree_node_noevict(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+}
+
+static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ unsigned level = b->c.level;
+
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
+static void bch2_btree_node_free_never_used(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b)
+{
+ struct bch_fs *c = as->c;
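+ /* b->c.lock.readers is only set for interior nodes - pick the matching prealloc list: */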
+ struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
+ struct btree_path *path;
+ unsigned level = b->c.level;
+
+ BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
+
+ b->will_make_reachable = 0;
+ closure_put(&as->cl);
+
+ clear_btree_node_will_make_reachable(b);
+ clear_btree_node_accessed(b);
+ clear_btree_node_dirty_acct(c, b);
+ clear_btree_node_need_write(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_del_init(&b->list);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+
+ BUG_ON(p->nr >= ARRAY_SIZE(p->b));
+ p->b[p->nr++] = b;
+
+ six_unlock_intent(&b->c.lock);
+
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct closure *cl,
+ bool interior_node,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp;
+ struct btree *b;
+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct open_buckets obs = { .nr = 0 };
+ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+ unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+ ? BTREE_NODE_RESERVE
+ : 0;
+ int ret;
+
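+ /* First, try to reuse an allocation from the btree node reserve cache: */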
+ mutex_lock(&c->btree_reserve_cache_lock);
+ if (c->btree_reserve_cache_nr > nr_reserve) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ obs = a->ob;
+ bkey_copy(&tmp.k, &a->k);
+ mutex_unlock(&c->btree_reserve_cache_lock);
+ goto mem_alloc;
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+retry:
+ ret = bch2_alloc_sectors_start_trans(trans,
+ c->opts.metadata_target ?:
+ c->opts.foreground_target,
+ 0,
+ writepoint_ptr(&c->btree_write_point),
+ &devs_have,
+ res->nr_replicas,
+ c->opts.metadata_replicas_required,
+ watermark, 0, cl, &wp);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
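+ /*
+ * The write point doesn't have a full btree node's worth of sectors
+ * left: mark its too-small open buckets as exhausted and retry:
+ */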
+ if (wp->sectors_free < btree_sectors(c)) {
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (ob->sectors_free < btree_sectors(c))
+ ob->sectors_free = 0;
+
+ bch2_alloc_sectors_done(c, wp);
+ goto retry;
+ }
+
+ bkey_btree_ptr_v2_init(&tmp.k);
+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
+
+ bch2_open_bucket_get(c, wp, &obs);
+ bch2_alloc_sectors_done(c, wp);
+mem_alloc:
+ b = bch2_btree_node_mem_alloc(trans, interior_node);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ /* we hold cannibalize_lock: */
+ BUG_ON(IS_ERR(b));
+ BUG_ON(b->ob.nr);
+
+ bkey_copy(&b->key, &tmp.k);
+ b->ob = obs;
+
+ return b;
+}
+
+static struct btree *bch2_btree_node_alloc(struct btree_update *as,
+ struct btree_trans *trans,
+ unsigned level)
+{
+ struct bch_fs *c = as->c;
+ struct btree *b;
+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
+ int ret;
+
+ BUG_ON(level >= BTREE_MAX_DEPTH);
+ BUG_ON(!p->nr);
+
+ b = p->b[--p->nr];
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+
+ set_btree_node_accessed(b);
+ set_btree_node_dirty_acct(c, b);
+ set_btree_node_need_write(b);
+
+ bch2_bset_init_first(b, &b->data->keys);
+ b->c.level = level;
+ b->c.btree_id = as->btree_id;
+ b->version_ondisk = c->sb.version;
+
+ memset(&b->nr, 0, sizeof(b->nr));
+ b->data->magic = cpu_to_le64(bset_magic(c));
+ memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
+ b->data->flags = 0;
+ SET_BTREE_NODE_ID(b->data, as->btree_id);
+ SET_BTREE_NODE_LEVEL(b->data, level);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
+
+ bp->v.mem_ptr = 0;
+ bp->v.seq = b->data->keys.seq;
+ bp->v.sectors_written = 0;
+ }
+
+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
+
+ bch2_btree_build_aux_trees(b);
+
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
+ BUG_ON(ret);
+
+ trace_and_count(c, btree_node_alloc, c, b);
+ bch2_increment_clock(c, btree_sectors(c), WRITE);
+ return b;
+}
+
+static void btree_set_min(struct btree *b, struct bpos pos)
+{
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
+ b->data->min_key = pos;
+}
+
+static void btree_set_max(struct btree *b, struct bpos pos)
+{
+ b->key.k.p = pos;
+ b->data->max_key = pos;
+}
+
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b)
+{
+ struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
+ struct bkey_format format = bch2_btree_calc_format(b);
+
+ /*
+ * The keys might expand with the new format - if they wouldn't fit in
+ * the btree node anymore, use the old format for now:
+ */
+ if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
+ format = b->format;
+
+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
+
+ btree_set_min(n, b->data->min_key);
+ btree_set_max(n, b->data->max_key);
+
+ n->data->format = format;
+ btree_node_set_format(n, format);
+
+ bch2_btree_sort_into(as->c, n, b);
+
+ btree_node_reset_sib_u64s(n);
+ return n;
+}
+
+static struct btree *__btree_root_alloc(struct btree_update *as,
+ struct btree_trans *trans, unsigned level)
+{
+ struct btree *b = bch2_btree_node_alloc(as, trans, level);
+
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, SPOS_MAX);
+ b->data->format = bch2_btree_calc_format(b);
+
+ btree_node_set_format(b, b->data->format);
+ bch2_btree_build_aux_trees(b);
+
+ return b;
+}
+
+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
+{
+ struct bch_fs *c = as->c;
+ struct prealloc_nodes *p;
+
+ for (p = as->prealloc_nodes;
+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+ p++) {
+ while (p->nr) {
+ struct btree *b = p->b[--p->nr];
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+
+ a->ob = b->ob;
+ b->ob.nr = 0;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch2_open_buckets_put(c, &b->ob);
+ }
+
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ }
+ }
+}
+
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+ struct btree_update *as,
+ unsigned nr_nodes[2],
+ unsigned flags,
+ struct closure *cl)
+{
+ struct bch_fs *c = as->c;
+ struct btree *b;
+ unsigned interior;
+ int ret = 0;
+
+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
+
+ /*
+ * Protects reaping from the btree node cache and using the btree node
+ * open bucket reserve:
+ *
+ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+ * blocking on this lock:
+ */
+ ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ if (ret)
+ return ret;
+
+ for (interior = 0; interior < 2; interior++) {
+ struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+ while (p->nr < nr_nodes[interior]) {
+ b = __bch2_btree_node_alloc(trans, &as->disk_res,
+ flags & BTREE_INSERT_NOWAIT ? NULL : cl,
+ interior, flags);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
+
+ p->b[p->nr++] = b;
+ }
+ }
+err:
+ bch2_btree_cache_cannibalize_unlock(c);
+ return ret;
+}
+
+/* Asynchronous interior node update machinery */
+
+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
+{
+ struct bch_fs *c = as->c;
+
+ if (as->took_gc_lock)
+ up_read(&c->gc_lock);
+ as->took_gc_lock = false;
+
+ bch2_journal_pin_drop(&c->journal, &as->journal);
+ bch2_journal_pin_flush(&c->journal, &as->journal);
+ bch2_disk_reservation_put(c, &as->disk_res);
+ bch2_btree_reserve_put(as, trans);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+ as->start_time);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_del(&as->unwritten_list);
+ list_del(&as->list);
+
+ closure_debug_destroy(&as->cl);
+ mempool_free(as, &c->btree_interior_update_pool);
+
+ /*
+ * Have to do the wakeup with btree_interior_update_lock still held,
+ * since being on btree_interior_update_list is our ref on @c:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_update_add_key(struct btree_update *as,
+ struct keylist *keys, struct btree *b)
+{
+ struct bkey_i *k = &b->key;
+
+ BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
+ ARRAY_SIZE(as->_old_keys));
+
+ bkey_copy(keys->top, k);
+ bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
+
+ bch2_keylist_push(keys);
+}
+
+/*
+ * The transactional part of an interior btree node update, where we journal the
+ * update we did to the interior node and update alloc info:
+ */
+static int btree_update_nodes_written_trans(struct btree_trans *trans,
+ struct btree_update *as)
+{
+ struct bkey_i *k;
+ int ret;
+
+ ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+ if (ret)
+ return ret;
+
+ memcpy(&darray_top(trans->extra_journal_entries),
+ as->journal_entries,
+ as->journal_u64s * sizeof(u64));
+ trans->extra_journal_entries.nr += as->journal_u64s;
+
+ trans->journal_pin = &as->journal;
+
+ for_each_keylist_key(&as->old_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+ ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
+ if (ret)
+ return ret;
+ }
+
+ for_each_keylist_key(&as->new_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+ ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void btree_update_nodes_written(struct btree_update *as)
+{
+ struct bch_fs *c = as->c;
+ struct btree *b;
+ struct btree_trans *trans = bch2_trans_get(c);
+ u64 journal_seq = 0;
+ unsigned i;
+ int ret;
+
+ /*
+ * If we're already in an error state, it might be because a btree node
+ * was never written, and we might be trying to free that same btree
+ * node here, but it won't have been marked as allocated and we'll see
+ * spurious disk usage inconsistencies in the transactional part below
+ * if we don't skip it:
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
+ /*
+ * Wait for any in flight writes to finish before we free the old nodes
+ * on disk:
+ */
+ for (i = 0; i < as->nr_old_nodes; i++) {
+ __le64 seq;
+
+ b = as->old_nodes[i];
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ seq = b->data ? b->data->keys.seq : 0;
+ six_unlock_read(&b->c.lock);
+
+ if (seq == as->old_nodes_seq[i])
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
+ TASK_UNINTERRUPTIBLE);
+ }
+
+ /*
+ * We did an update to a parent node where the pointers we added pointed
+ * to child nodes that weren't written yet: now, the child nodes have
+ * been written so we can write out the update to the interior node.
+ */
+
+ /*
+ * We can't call into journal reclaim here: we'd block on the journal
+ * reclaim lock, but we may need to release the open buckets we have
+ * pinned in order for other btree updates to make forward progress, and
+ * journal reclaim does btree updates when flushing bkey_cached entries,
+ * which may require allocations as well.
+ */
+ ret = commit_do(trans, &as->disk_res, &journal_seq,
+ BCH_WATERMARK_reclaim|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_JOURNAL_RECLAIM,
+ btree_update_nodes_written_trans(trans, as));
+ bch2_trans_unlock(trans);
+
+ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
+ "%s(): error %s", __func__, bch2_err_str(ret));
+err:
+ if (as->b) {
+ struct btree_path *path;
+
+ b = as->b;
+ path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
+ /*
+ * @b is the node we did the final insert into:
+ *
+ * On failure to get a journal reservation, we still have to
+ * unblock the write and allow most of the write path to happen
+ * so that shutdown works, but the i->journal_seq mechanism
+ * won't work to prevent the btree write from being visible (we
+ * didn't get a journal sequence number) - instead
+ * __bch2_btree_node_write() doesn't do the actual write if
+ * we're in journal error state:
+ */
+
+ /*
+ * Ensure transaction is unlocked before using
+ * btree_node_lock_nopath() (the use of which is always suspect,
+ * we need to work on removing this in the future)
+ *
+ * It should be, but get_unlocked_mut_path() -> bch2_path_get()
+ * calls bch2_path_upgrade(), before we call path_make_mut(), so
+ * we may rarely end up with a locked path besides the one we
+ * have here:
+ */
+ bch2_trans_unlock(trans);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
+ path->l[b->c.level].b = b;
+
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ list_del(&as->write_blocked_list);
+ if (list_empty(&b->write_blocked))
+ clear_btree_node_write_blocked(b);
+
+ /*
+ * Node might have been freed, recheck under
+ * btree_interior_update_lock:
+ */
+ if (as->b == b) {
+ BUG_ON(!b->c.level);
+ BUG_ON(!btree_node_dirty(b));
+
+ if (!ret) {
+ struct bset *last = btree_bset_last(b);
+
+ last->journal_seq = cpu_to_le64(
+ max(journal_seq,
+ le64_to_cpu(last->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, journal_seq);
+ } else {
+ /*
+ * If we didn't get a journal sequence number we
+ * can't write this btree node, because recovery
+ * won't know to ignore this write:
+ */
+ set_btree_node_never_write(b);
+ }
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+ six_unlock_write(&b->c.lock);
+
+ btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ btree_node_unlock(trans, path, b->c.level);
+ bch2_path_put(trans, path, true);
+ }
+
+ bch2_journal_pin_drop(&c->journal, &as->journal);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ for (i = 0; i < as->nr_new_nodes; i++) {
+ b = as->new_nodes[i];
+
+ BUG_ON(b->will_make_reachable != (unsigned long) as);
+ b->will_make_reachable = 0;
+ clear_btree_node_will_make_reachable(b);
+ }
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ for (i = 0; i < as->nr_new_nodes; i++) {
+ b = as->new_nodes[i];
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->c.lock);
+ }
+
+ for (i = 0; i < as->nr_open_buckets; i++)
+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
+
+ bch2_btree_update_free(as, trans);
+ bch2_trans_put(trans);
+}
+
+static void btree_interior_update_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, btree_interior_update_work);
+ struct btree_update *as;
+
+ while (1) {
+ mutex_lock(&c->btree_interior_update_lock);
+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
+ struct btree_update, unwritten_list);
+ if (as && !as->nodes_written)
+ as = NULL;
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ if (!as)
+ break;
+
+ btree_update_nodes_written(as);
+ }
+}
+
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
+{
+ closure_type(as, struct btree_update, cl);
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ as->nodes_written = true;
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * block @b from being written until @as completes
+ */
+static void btree_update_updated_node(struct btree_update *as, struct btree *b)
+{
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(!btree_node_dirty(b));
+ BUG_ON(!b->c.level);
+
+ as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ as->b = b;
+
+ set_btree_node_write_blocked(b);
+ list_add(&as->write_blocked_list, &b->write_blocked);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_update_reparent(struct btree_update *as,
+ struct btree_update *child)
+{
+ struct bch_fs *c = as->c;
+
+ lockdep_assert_held(&c->btree_interior_update_lock);
+
+ child->b = NULL;
+ child->mode = BTREE_INTERIOR_UPDATING_AS;
+
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+}
+
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
+{
+ struct bkey_i *insert = &b->key;
+ struct bch_fs *c = as->c;
+
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+ ARRAY_SIZE(as->journal_entries));
+
+ as->journal_u64s +=
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ insert, insert->k.u64s);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * bch2_btree_update_add_new_node:
+ *
+ * This causes @as to wait on @b to be written, before it gets to
+ * btree_update_nodes_written()
+ *
+ * Additionally, it sets b->will_make_reachable to prevent any additional writes
+ * to @b from happening besides the first until @b is reachable on disk
+ *
+ * And it adds @b to the list of @as's new nodes, so that we can update sector
+ * counts in btree_update_nodes_written():
+ */
+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
+{
+ struct bch_fs *c = as->c;
+
+ closure_get(&as->cl);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
+ BUG_ON(b->will_make_reachable);
+
+ as->new_nodes[as->nr_new_nodes++] = b;
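+ /* Tagged pointer to @as - the low bit records that we hold a ref on as->cl: */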
+ b->will_make_reachable = 1UL|(unsigned long) as;
+ set_btree_node_will_make_reachable(b);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ btree_update_add_key(as, &as->new_keys, b);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
+ unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
+
+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+ cpu_to_le16(sectors);
+ }
+}
+
+/*
+ * If @b was a new node, in the process of being made reachable by a
+ * btree_update, drop it from that update's list of new nodes:
+ */
+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
+{
+ struct btree_update *as;
+ unsigned long v;
+ unsigned i;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ /*
+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's
+ * dropped when it gets written by bch2_btree_complete_write - the
+ * xchg() is for synchronization with bch2_btree_complete_write:
+ */
+ v = xchg(&b->will_make_reachable, 0);
+ clear_btree_node_will_make_reachable(b);
+ as = (struct btree_update *) (v & ~1UL);
+
+ if (!as) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ return;
+ }
+
+ for (i = 0; i < as->nr_new_nodes; i++)
+ if (as->new_nodes[i] == b)
+ goto found;
+
+ BUG();
+found:
+ array_remove_item(as->new_nodes, as->nr_new_nodes, i);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ if (v & 1)
+ closure_put(&as->cl);
+}
+
+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
+{
+ while (b->ob.nr)
+ as->open_buckets[as->nr_open_buckets++] =
+ b->ob.v[--b->ob.nr];
+}
+
+/*
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
+ * nodes and thus outstanding btree_updates - redirect @b's
+ * btree_updates to point to this btree_update:
+ */
+static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+ struct btree *b)
+{
+ struct bch_fs *c = as->c;
+ struct btree_update *p, *n;
+ struct btree_write *w;
+
+ set_btree_node_dying(b);
+
+ if (btree_node_fake(b))
+ return;
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /*
+ * Does this node have any btree_update operations preventing
+ * it from being written?
+ *
+ * If so, redirect them to point to this btree_update: we can
+ * write out our new nodes, but we won't make them visible until those
+ * operations complete
+ */
+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
+ list_del_init(&p->write_blocked_list);
+ btree_update_reparent(as, p);
+
+ /*
+ * for flush_held_btree_writes() waiting on updates to flush or
+ * nodes to be writeable:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
+ }
+
+ clear_btree_node_dirty_acct(c, b);
+ clear_btree_node_need_write(b);
+ clear_btree_node_write_blocked(b);
+
+ /*
+ * Does this node have unwritten data that has a pin on the journal?
+ *
+ * If so, transfer that pin to the btree_update operation -
+ * note that if we're freeing multiple nodes, we only need to keep the
+ * oldest pin of any of the nodes we're freeing. We'll release the pin
+ * when the new nodes are persistent and reachable on disk:
+ */
+ w = btree_current_write(b);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_drop(&c->journal, &w->journal);
+
+ w = btree_prev_write(b);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_drop(&c->journal, &w->journal);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * Is this a node that isn't reachable on disk yet?
+ *
+ * Nodes that aren't reachable yet have writes blocked until they're
+ * reachable - now that we've cancelled any pending writes and moved
+ * things waiting on that write to wait on this update, we can drop this
+ * node from the list of nodes that the other update is making
+ * reachable, prior to freeing it:
+ */
+ btree_update_drop_new_node(c, b);
+
+ btree_update_add_key(as, &as->old_keys, b);
+
+ as->old_nodes[as->nr_old_nodes] = b;
+ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
+ as->nr_old_nodes++;
+}
+
+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
+{
+ struct bch_fs *c = as->c;
+ u64 start_time = as->start_time;
+
+ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+
+ if (as->took_gc_lock)
+ up_read(&as->c->gc_lock);
+ as->took_gc_lock = false;
+
+ bch2_btree_reserve_put(as, trans);
+
+ continue_at(&as->cl, btree_update_set_nodes_written,
+ as->c->btree_interior_update_worker);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+ start_time);
+}
+
+static struct btree_update *
+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
+ unsigned level, bool split, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_update *as;
+ u64 start_time = local_clock();
+ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
+ unsigned nr_nodes[2] = { 0, 0 };
+ unsigned update_level = level;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+ int ret = 0;
+ u32 restart_count = trans->restart_count;
+
+ BUG_ON(!path->should_be_locked);
+
+ if (watermark == BCH_WATERMARK_copygc)
+ watermark = BCH_WATERMARK_btree_copygc;
+ if (watermark < BCH_WATERMARK_btree)
+ watermark = BCH_WATERMARK_btree;
+
+ flags &= ~BCH_WATERMARK_MASK;
+ flags |= watermark;
+
+ if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ watermark < c->journal.watermark) {
+ struct journal_res res = { 0 };
+
+ ret = drop_locks_do(trans,
+ bch2_journal_res_get(&c->journal, &res, 1,
+ watermark|JOURNAL_RES_GET_CHECK));
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
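+ /*
+ * Walk up the tree, counting how many new nodes we may need at each
+ * level, until we reach a node the insert will fit in:
+ */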
+ while (1) {
+ nr_nodes[!!update_level] += 1 + split;
+ update_level++;
+
+ ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!btree_path_node(path, update_level)) {
+ /* Allocating new root? */
+ nr_nodes[1] += split;
+ update_level = BTREE_MAX_DEPTH;
+ break;
+ }
+
+ /*
+ * Always check for space for two keys, even if we won't have to
+ * split at prior level - it might have been a merge instead:
+ */
+ if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ BKEY_BTREE_PTR_U64s_MAX * 2))
+ break;
+
+ split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ }
+
+ if (flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&c->gc_lock);
+ else if (!down_read_trylock(&c->gc_lock)) {
+ ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
+ if (ret) {
+ up_read(&c->gc_lock);
+ return ERR_PTR(ret);
+ }
+ }
+
+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
+ memset(as, 0, sizeof(*as));
+ closure_init(&as->cl, NULL);
+ as->c = c;
+ as->start_time = start_time;
+ as->mode = BTREE_INTERIOR_NO_UPDATE;
+ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->btree_id = path->btree_id;
+ as->update_level = update_level;
+ INIT_LIST_HEAD(&as->list);
+ INIT_LIST_HEAD(&as->unwritten_list);
+ INIT_LIST_HEAD(&as->write_blocked_list);
+ bch2_keylist_init(&as->old_keys, as->_old_keys);
+ bch2_keylist_init(&as->new_keys, as->_new_keys);
+ bch2_keylist_init(&as->parent_keys, as->inline_keys);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->list, &c->btree_interior_update_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * We don't want to allocate if we're in an error state, that can cause
+ * deadlock on emergency shutdown due to open buckets getting stuck in
+ * the btree_reserve_cache after allocator shutdown has cleared it out.
+ * This check needs to come after adding us to the btree_interior_update
+ * list but before calling bch2_btree_reserve_get, to synchronize with
+ * __bch2_fs_read_only().
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
+ ret = bch2_disk_reservation_get(c, &as->disk_res,
+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
+ c->opts.metadata_replicas,
+ disk_res_flags);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+ if (bch2_err_matches(ret, ENOSPC) ||
+ bch2_err_matches(ret, ENOMEM)) {
+ struct closure cl;
+
+ /*
+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+ * flag
+ */
+ if (bch2_err_matches(ret, ENOSPC) &&
+ (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ watermark != BCH_WATERMARK_reclaim) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ goto err;
+ }
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
+
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
+ }
+
+ if (ret) {
+ trace_and_count(c, btree_reserve_get_fail, trans->fn,
+ _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
+ goto err;
+ }
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
+
+ bch2_trans_verify_not_restarted(trans, restart_count);
+ return as;
+err:
+ bch2_btree_update_free(as, trans);
+ return ERR_PTR(ret);
+}
+
+/* Btree root updates: */
+
+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
+{
+ /* Root nodes cannot be reaped */
+ mutex_lock(&c->btree_cache.lock);
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache.lock);
+
+ mutex_lock(&c->btree_root_lock);
+ BUG_ON(btree_node_root(c, b) &&
+ (b->c.level < btree_node_root(c, b)->c.level ||
+ !btree_node_dying(btree_node_root(c, b))));
+
+ bch2_btree_id_root(c, b->c.btree_id)->b = b;
+ mutex_unlock(&c->btree_root_lock);
+
+ bch2_recalc_btree_reserve(c);
+}
+
+static void bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = as->c;
+ struct btree *old;
+
+ trace_and_count(c, btree_node_set_root, c, b);
+
+ old = btree_node_root(c, b);
+
+ /*
+ * Ensure no one is using the old root while we switch to the
+ * new root:
+ */
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+
+ bch2_btree_set_root_inmem(c, b);
+
+ btree_update_updated_root(as, b);
+
+ /*
+ * Unlock old root after new root is visible:
+ *
+ * The new root isn't persistent, but that's ok: we still have
+ * an intent lock on the new root, and any updates that would
+ * depend on the new root would have to update the new root.
+ */
+ bch2_btree_node_unlock_write(trans, path, old);
+}
+
+/* Interior node updates: */
+
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = as->c;
+ struct bkey_packed *k;
+ struct printbuf buf = PRINTBUF;
+ unsigned long old, new, v;
+
+ BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
+ !btree_ptr_sectors_written(insert));
+
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), WRITE, &buf) ?:
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "inserting invalid bkey\n ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ prt_printf(&buf, "\n ");
+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), WRITE, &buf);
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ dump_stack();
+ }
+
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+ ARRAY_SIZE(as->journal_entries));
+
+ as->journal_u64s +=
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+ BCH_JSET_ENTRY_btree_keys,
+ b->c.btree_id, b->c.level,
+ insert, insert->k.u64s);
+
+ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
+ bch2_btree_node_iter_advance(node_iter, b);
+
+ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
+ set_btree_node_dirty_acct(c, b);
+
+ v = READ_ONCE(b->flags);
+ do {
+ old = new = v;
+
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ new |= BTREE_WRITE_interior;
+ new |= 1 << BTREE_NODE_need_write;
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ printbuf_exit(&buf);
+}
+
+static void
+__bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
+{
+ struct bkey_i *insert = bch2_keylist_front(keys);
+ struct bkey_packed *k;
+
+ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
+
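+ /* Rewind the node iterator to before any keys at or after the first insert position: */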
+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
+ ;
+
+ while (!bch2_keylist_empty(keys)) {
+ insert = bch2_keylist_front(keys);
+
+ if (bpos_gt(insert->k.p, b->key.k.p))
+ break;
+
+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
+ bch2_keylist_pop_front(keys);
+ }
+}
+
+/*
+ * Distribute the keys from @b between the two new nodes: n[0] becomes the
+ * lower node and n[1] the higher node, each with its own packed key format
+ */
+static void __btree_split_node(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b,
+ struct btree *n[2])
+{
+ struct bkey_packed *k;
+ struct bpos n1_pos = POS_MIN;
+ struct btree_node_iter iter;
+ struct bset *bsets[2];
+ struct bkey_format_state format[2];
+ struct bkey_packed *out[2];
+ struct bkey uk;
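+ /* Aim to put roughly 3/5 of the live u64s in the lower node: */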
+ unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
+ struct { unsigned nr_keys, val_u64s; } nr_keys[2];
+ int i;
+
+ memset(&nr_keys, 0, sizeof(nr_keys));
+
+ for (i = 0; i < 2; i++) {
+ BUG_ON(n[i]->nsets != 1);
+
+ bsets[i] = btree_bset_first(n[i]);
+ out[i] = bsets[i]->start;
+
+ SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
+ bch2_bkey_format_init(&format[i]);
+ }
+
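+ /* First pass: pick the pivot and build each new node's packed key format: */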
+ u64s = 0;
+ for_each_btree_node_key(b, k, &iter) {
+ if (bkey_deleted(k))
+ continue;
+
+ i = u64s >= n1_u64s;
+ u64s += k->u64s;
+ uk = bkey_unpack_key(b, k);
+ if (!i)
+ n1_pos = uk.p;
+ bch2_bkey_format_add_key(&format[i], &uk);
+
+ nr_keys[i].nr_keys++;
+ nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
+ }
+
+ btree_set_min(n[0], b->data->min_key);
+ btree_set_max(n[0], n1_pos);
+ btree_set_min(n[1], bpos_successor(n1_pos));
+ btree_set_max(n[1], b->data->max_key);
+
+ for (i = 0; i < 2; i++) {
+ bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
+ bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
+
+ n[i]->data->format = bch2_bkey_format_done(&format[i]);
+
+ unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
+ nr_keys[i].val_u64s;
+ if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
+ n[i]->data->format = b->format;
+
+ btree_node_set_format(n[i], n[i]->data->format);
+ }
+
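+ /* Second pass: repack (or unpack) the keys into the new nodes: */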
+ u64s = 0;
+ for_each_btree_node_key(b, k, &iter) {
+ if (bkey_deleted(k))
+ continue;
+
+ i = u64s >= n1_u64s;
+ u64s += k->u64s;
+
+ if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
+ ? &b->format: &bch2_bkey_format_current, k))
+ out[i]->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bch2_bkey_unpack(b, (void *) out[i], k);
+
+ out[i]->needs_whiteout = false;
+
+ btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
+ out[i] = bkey_p_next(out[i]);
+ }
+
+ for (i = 0; i < 2; i++) {
+ bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
+
+ BUG_ON(!bsets[i]->u64s);
+
+ set_btree_bset_end(n[i], n[i]->set);
+
+ btree_node_reset_sib_u64s(n[i]);
+
+ bch2_verify_btree_nr_keys(n[i]);
+
+ if (b->c.level)
+ btree_node_interior_verify(as->c, n[i]);
+ }
+}
+
+/*
+ * For updates to interior nodes, we've got to do the insert before we split
+ * because the stuff we're inserting has to be inserted atomically. Post split,
+ * the keys might have to go in different nodes and the split would no longer be
+ * atomic.
+ *
+ * Worse, if the insert is from btree node coalescing, if we do the insert after
+ * we do the split (and pick the pivot) - the pivot we pick might be between
+ * nodes that were coalesced, and thus in the middle of a child node post
+ * coalescing:
+ */
+static void btree_split_insert_keys(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct keylist *keys)
+{
+ if (!bch2_keylist_empty(keys) &&
+ bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
+ struct btree_node_iter node_iter;
+
+ bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
+
+ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+
+ btree_node_interior_verify(as->c, b);
+ }
+}
+
+static int btree_split(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
+{
+ struct bch_fs *c = as->c;
+ struct btree *parent = btree_node_parent(path, b);
+ struct btree *n1, *n2 = NULL, *n3 = NULL;
+ struct btree_path *path1 = NULL, *path2 = NULL;
+ u64 start_time = local_clock();
+ int ret = 0;
+
+ BUG_ON(!parent && (b != btree_node_root(c, b)));
+ BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
+
+ bch2_btree_interior_update_will_free_node(as, b);
+
+ if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
+ struct btree *n[2];
+
+ trace_and_count(c, btree_node_split, c, b);
+
+ n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
+ n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
+
+ __btree_split_node(as, trans, b, n);
+
+ if (keys) {
+ btree_split_insert_keys(as, trans, path, n1, keys);
+ btree_split_insert_keys(as, trans, path, n2, keys);
+ BUG_ON(!bch2_keylist_empty(keys));
+ }
+
+ bch2_btree_build_aux_trees(n2);
+ bch2_btree_build_aux_trees(n1);
+
+ bch2_btree_update_add_new_node(as, n1);
+ bch2_btree_update_add_new_node(as, n2);
+ six_unlock_write(&n2->c.lock);
+ six_unlock_write(&n1->c.lock);
+
+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path1, n1);
+
+ path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+ six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path2, n2);
+
+ /*
+ * Note that on recursive splits, parent_keys == keys, so we
+ * can't start adding new keys to parent_keys before emptying it
+ * out (which we did with btree_split_insert_keys() above)
+ */
+ bch2_keylist_add(&as->parent_keys, &n1->key);
+ bch2_keylist_add(&as->parent_keys, &n2->key);
+
+ if (!parent) {
+ /* Depth increases, make a new root */
+ n3 = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n3);
+ six_unlock_write(&n3->c.lock);
+
+ path2->locks_want++;
+ BUG_ON(btree_node_locked(path2, n3->c.level));
+ six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path2, n3);
+
+ n3->sib_u64s[0] = U16_MAX;
+ n3->sib_u64s[1] = U16_MAX;
+
+ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
+ }
+ } else {
+ trace_and_count(c, btree_node_compact, c, b);
+
+ n1 = bch2_btree_node_alloc_replacement(as, trans, b);
+
+ if (keys) {
+ btree_split_insert_keys(as, trans, path, n1, keys);
+ BUG_ON(!bch2_keylist_empty(keys));
+ }
+
+ bch2_btree_build_aux_trees(n1);
+ bch2_btree_update_add_new_node(as, n1);
+ six_unlock_write(&n1->c.lock);
+
+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path1, n1);
+
+ if (parent)
+ bch2_keylist_add(&as->parent_keys, &n1->key);
+ }
+
+ /* New nodes all written, now make them visible: */
+
+ if (parent) {
+ /* Split a non root node */
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ if (ret)
+ goto err;
+ } else if (n3) {
+ bch2_btree_set_root(as, trans, path, n3);
+ } else {
+ /* Root filled up but didn't need to be split */
+ bch2_btree_set_root(as, trans, path, n1);
+ }
+
+ if (n3) {
+ bch2_btree_update_get_open_buckets(as, n3);
+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+ }
+ if (n2) {
+ bch2_btree_update_get_open_buckets(as, n2);
+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+ }
+ bch2_btree_update_get_open_buckets(as, n1);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
+
+ /*
+ * The old node must be freed (in memory) _before_ unlocking the new
+ * nodes - else another thread could re-acquire a read lock on the old
+ * node after another thread has locked and updated the new node, thus
+ * seeing stale data:
+ */
+ bch2_btree_node_free_inmem(trans, path, b);
+
+ if (n3)
+ bch2_trans_node_add(trans, n3);
+ if (n2)
+ bch2_trans_node_add(trans, n2);
+ bch2_trans_node_add(trans, n1);
+
+ if (n3)
+ six_unlock_intent(&n3->c.lock);
+ if (n2)
+ six_unlock_intent(&n2->c.lock);
+ six_unlock_intent(&n1->c.lock);
+out:
+ if (path2) {
+ __bch2_btree_path_unlock(trans, path2);
+ bch2_path_put(trans, path2, true);
+ }
+ if (path1) {
+ __bch2_btree_path_unlock(trans, path1);
+ bch2_path_put(trans, path1, true);
+ }
+
+ bch2_trans_verify_locks(trans);
+
+ bch2_time_stats_update(&c->times[n2
+ ? BCH_TIME_btree_node_split
+ : BCH_TIME_btree_node_compact],
+ start_time);
+ return ret;
+err:
+ if (n3)
+ bch2_btree_node_free_never_used(as, trans, n3);
+ if (n2)
+ bch2_btree_node_free_never_used(as, trans, n2);
+ bch2_btree_node_free_never_used(as, trans, n1);
+ goto out;
+}
+
+static void
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct keylist *keys)
+{
+ struct btree_path *linked;
+
+ __bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
+
+ btree_update_updated_node(as, b);
+
+ trans_for_each_path_with_node(trans, b, linked)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
+
+ bch2_trans_verify_paths(trans);
+}
+
+/**
+ * bch2_btree_insert_node - insert bkeys into a given btree node
+ *
+ * @as: btree_update object
+ * @trans: btree_trans object
+ * @path: path that points to current node
+ * @b: node to insert keys into
+ * @keys: list of keys to insert
+ * @flags: transaction commit flags
+ *
+ * Returns: 0 on success, typically transaction restart error on failure
+ *
+ * Inserts as many keys as it can into a given btree node, splitting it if full.
+ * If a split occurred, this function will return early. This can only happen
+ * for leaf nodes -- inserts into interior nodes have to be atomic.
+ */
+static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
+{
+ struct bch_fs *c = as->c;
+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+ int old_live_u64s = b->nr.live_u64s;
+ int live_u64s_added, u64s_added;
+ int ret;
+
+ lockdep_assert_held(&c->gc_lock);
+ BUG_ON(!btree_node_intent_locked(path, b->c.level));
+ BUG_ON(!b->c.level);
+ BUG_ON(!as || as->b);
+ bch2_verify_keylist_sorted(keys);
+
+ ret = bch2_btree_node_lock_write(trans, path, &b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, b);
+
+ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+ bch2_btree_node_unlock_write(trans, path, b);
+ goto split;
+ }
+
+ btree_node_interior_verify(c, b);
+
+ bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+ if (u64s_added > live_u64s_added &&
+ bch2_maybe_compact_whiteouts(c, b))
+ bch2_trans_node_reinit_iter(trans, b);
+
+ bch2_btree_node_unlock_write(trans, path, b);
+
+ btree_node_interior_verify(c, b);
+ return 0;
+split:
+ /*
+ * We could attempt to avoid the transaction restart, by calling
+ * bch2_btree_path_upgrade() and allocating more nodes:
+ */
+ if (b->c.level >= as->update_level) {
+ trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+ }
+
+ return btree_split(as, trans, path, b, keys, flags);
+}
+
+int bch2_btree_split_leaf(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags)
+{
+ struct btree *b = path_l(path)->b;
+ struct btree_update *as;
+ unsigned l;
+ int ret = 0;
+
+ as = bch2_btree_update_start(trans, path, path->level,
+ true, flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
+
+ ret = btree_split(as, trans, path, b, NULL, flags);
+ if (ret) {
+ bch2_btree_update_free(as, trans);
+ return ret;
+ }
+
+ bch2_btree_update_done(as, trans);
+
+ for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
+ ret = bch2_foreground_maybe_merge(trans, path, l, flags);
+
+ return ret;
+}
+
+int __bch2_foreground_maybe_merge(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ unsigned flags,
+ enum btree_node_sibling sib)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *sib_path = NULL, *new_path = NULL;
+ struct btree_update *as;
+ struct bkey_format_state new_s;
+ struct bkey_format new_f;
+ struct bkey_i delete;
+ struct btree *b, *m, *n, *prev, *next, *parent;
+ struct bpos sib_pos;
+ size_t sib_u64s;
+ u64 start_time = local_clock();
+ int ret = 0;
+
+ BUG_ON(!path->should_be_locked);
+ BUG_ON(!btree_node_locked(path, level));
+
+ b = path->l[level].b;
+
+ if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
+ (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
+ b->sib_u64s[sib] = U16_MAX;
+ return 0;
+ }
+
+ sib_pos = sib == btree_prev_sib
+ ? bpos_predecessor(b->data->min_key)
+ : bpos_successor(b->data->max_key);
+
+ sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
+ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, sib_path, false);
+ if (ret)
+ goto err;
+
+ btree_path_set_should_be_locked(sib_path);
+
+ m = sib_path->l[level].b;
+
+ if (btree_node_parent(path, b) !=
+ btree_node_parent(sib_path, m)) {
+ b->sib_u64s[sib] = U16_MAX;
+ goto out;
+ }
+
+ if (sib == btree_prev_sib) {
+ prev = m;
+ next = b;
+ } else {
+ prev = b;
+ next = m;
+ }
+
+ if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ bch2_bpos_to_text(&buf1, prev->data->max_key);
+ bch2_bpos_to_text(&buf2, next->data->min_key);
+ bch_err(c,
+ "%s(): btree topology error:\n"
+ " prev ends at %s\n"
+ " next starts at %s",
+ __func__, buf1.buf, buf2.buf);
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
+ bch2_topology_error(c);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_bkey_format_init(&new_s);
+ bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+ __bch2_btree_calc_format(&new_s, prev);
+ __bch2_btree_calc_format(&new_s, next);
+ bch2_bkey_format_add_pos(&new_s, next->data->max_key);
+ new_f = bch2_bkey_format_done(&new_s);
+
+ sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
+ btree_node_u64s_with_format(m->nr, &m->format, &new_f);
+
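+ /* Apply hysteresis so we don't flip-flop between merging and splitting: */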
+ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ sib_u64s /= 2;
+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ }
+
+ sib_u64s = min(sib_u64s, btree_max_u64s(c));
+ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
+ b->sib_u64s[sib] = sib_u64s;
+
+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
+ goto out;
+
+ parent = btree_node_parent(path, b);
+ as = bch2_btree_update_start(trans, path, level, false,
+ BTREE_INSERT_NOFAIL|flags);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret)
+ goto err;
+
+ trace_and_count(c, btree_node_merge, c, b);
+
+ bch2_btree_interior_update_will_free_node(as, b);
+ bch2_btree_interior_update_will_free_node(as, m);
+
+ n = bch2_btree_node_alloc(as, trans, b->c.level);
+
+ SET_BTREE_NODE_SEQ(n->data,
+ max(BTREE_NODE_SEQ(b->data),
+ BTREE_NODE_SEQ(m->data)) + 1);
+
+ btree_set_min(n, prev->data->min_key);
+ btree_set_max(n, next->data->max_key);
+
+ n->data->format = new_f;
+ btree_node_set_format(n, new_f);
+
+ bch2_btree_sort_into(c, n, prev);
+ bch2_btree_sort_into(c, n, next);
+
+ bch2_btree_build_aux_trees(n);
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, new_path, n);
+
+ bkey_init(&delete.k);
+ delete.k.p = prev->key.k.p;
+ bch2_keylist_add(&as->parent_keys, &delete);
+ bch2_keylist_add(&as->parent_keys, &n->key);
+
+ bch2_trans_verify_paths(trans);
+
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ if (ret)
+ goto err_free_update;
+
+ bch2_trans_verify_paths(trans);
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+ bch2_btree_node_free_inmem(trans, path, b);
+ bch2_btree_node_free_inmem(trans, sib_path, m);
+
+ bch2_trans_node_add(trans, n);
+
+ bch2_trans_verify_paths(trans);
+
+ six_unlock_intent(&n->c.lock);
+
+ bch2_btree_update_done(as, trans);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+out:
+err:
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
+ bch2_path_put(trans, sib_path, true);
+ bch2_trans_verify_locks(trans);
+ return ret;
+err_free_update:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
+}
+
+int bch2_btree_node_rewrite(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree *b,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *new_path = NULL;
+ struct btree *n, *parent;
+ struct btree_update *as;
+ int ret;
+
+ flags |= BTREE_INSERT_NOFAIL;
+
+ parent = btree_node_parent(iter->path, b);
+ as = bch2_btree_update_start(trans, iter->path, b->c.level,
+ false, flags);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret)
+ goto out;
+
+ bch2_btree_interior_update_will_free_node(as, b);
+
+ n = bch2_btree_node_alloc_replacement(as, trans, b);
+
+ bch2_btree_build_aux_trees(n);
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, new_path, n);
+
+ trace_and_count(c, btree_node_rewrite, c, b);
+
+ if (parent) {
+ bch2_keylist_add(&as->parent_keys, &n->key);
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent,
+ &as->parent_keys, flags);
+ if (ret)
+ goto err;
+ } else {
+ bch2_btree_set_root(as, trans, iter->path, n);
+ }
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+ bch2_btree_node_free_inmem(trans, iter->path, b);
+
+ bch2_trans_node_add(trans, n);
+ six_unlock_intent(&n->c.lock);
+
+ bch2_btree_update_done(as, trans);
+out:
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
+ bch2_trans_downgrade(trans);
+ return ret;
+err:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
+}
+
+struct async_btree_rewrite {
+ struct bch_fs *c;
+ struct work_struct work;
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ struct bpos pos;
+ __le64 seq;
+};
+
+static int async_btree_node_rewrite_trans(struct btree_trans *trans,
+ struct async_btree_rewrite *a)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct btree *b;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
+ BTREE_MAX_DEPTH, a->level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto out;
+
+ if (!b || b->data->keys.seq != a->seq) {
+ struct printbuf buf = PRINTBUF;
+
+ if (b)
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ else
+ prt_str(&buf, "(null");
+ bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s",
+ __func__, a->seq, buf.buf);
+ printbuf_exit(&buf);
+ goto out;
+ }
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static void async_btree_node_rewrite_work(struct work_struct *work)
+{
+ struct async_btree_rewrite *a =
+ container_of(work, struct async_btree_rewrite, work);
+ struct bch_fs *c = a->c;
+ int ret;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ async_btree_node_rewrite_trans(trans, a));
+ if (ret)
+ bch_err_fn(c, ret);
+ bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
+ kfree(a);
+}
+
+void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
+{
+ struct async_btree_rewrite *a;
+ int ret;
+
+ a = kmalloc(sizeof(*a), GFP_NOFS);
+ if (!a) {
+ bch_err(c, "%s: error allocating memory", __func__);
+ return;
+ }
+
+ a->c = c;
+ a->btree_id = b->c.btree_id;
+ a->level = b->c.level;
+ a->pos = b->key.k.p;
+ a->seq = b->data->keys.seq;
+ INIT_WORK(&a->work, async_btree_node_rewrite_work);
+
+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_add(&a->list, &c->pending_node_rewrites);
+ mutex_unlock(&c->pending_node_rewrites_lock);
+ return;
+ }
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ if (test_bit(BCH_FS_STARTED, &c->flags)) {
+ bch_err(c, "%s: error getting c->writes ref", __func__);
+ kfree(a);
+ return;
+ }
+
+ ret = bch2_fs_read_write_early(c);
+ if (ret) {
+ bch_err_msg(c, ret, "going read-write");
+ kfree(a);
+ return;
+ }
+
+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ }
+
+ queue_work(c->btree_interior_update_worker, &a->work);
+}
+
+void bch2_do_pending_node_rewrites(struct bch_fs *c)
+{
+ struct async_btree_rewrite *a, *n;
+
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
+ list_del(&a->list);
+
+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ queue_work(c->btree_interior_update_worker, &a->work);
+ }
+ mutex_unlock(&c->pending_node_rewrites_lock);
+}
+
+void bch2_free_pending_node_rewrites(struct bch_fs *c)
+{
+ struct async_btree_rewrite *a, *n;
+
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
+ list_del(&a->list);
+
+ kfree(a);
+ }
+ mutex_unlock(&c->pending_node_rewrites_lock);
+}
+
+static int __bch2_btree_node_update_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree *b, struct btree *new_hash,
+ struct bkey_i *new_key,
+ unsigned commit_flags,
+ bool skip_triggers)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter2 = { NULL };
+ struct btree *parent;
+ int ret;
+
+ if (!skip_triggers) {
+ ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key), 0);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
+ new_key, 0);
+ if (ret)
+ return ret;
+ }
+
+ if (new_hash) {
+ bkey_copy(&new_hash->key, new_key);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache,
+ new_hash, b->c.level, b->c.btree_id);
+ BUG_ON(ret);
+ }
+
+ parent = btree_node_parent(iter->path, b);
+ if (parent) {
+ bch2_trans_copy_iter(&iter2, iter);
+
+ iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
+ iter2.flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+
+ BUG_ON(iter2.path->level != b->c.level);
+ BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p));
+
+ btree_path_set_level_up(trans, iter2.path);
+
+ trans->paths_sorted = false;
+
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+ } else {
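+ /*
+ * No parent - we're updating the root: emit a btree_root journal
+ * entry instead of a btree update
+ */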
+ BUG_ON(btree_node_root(c, b) != b);
+
+ ret = darray_make_room(&trans->extra_journal_entries,
+ jset_u64s(new_key->k.u64s));
+ if (ret)
+ return ret;
+
+ journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ new_key, new_key->k.u64s);
+ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
+ if (ret)
+ goto err;
+
+ bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
+
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new_key);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ } else {
+ bkey_copy(&b->key, new_key);
+ }
+
+ bch2_btree_node_unlock_write(trans, iter->path, b);
+out:
+ bch2_trans_iter_exit(trans, &iter2);
+ return ret;
+err:
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+ }
+ goto out;
+}
+
+int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b, struct bkey_i *new_key,
+ unsigned commit_flags, bool skip_triggers)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *new_hash = NULL;
+ struct btree_path *path = iter->path;
+ struct closure cl;
+ int ret = 0;
+
+ ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
+ if (ret)
+ return ret;
+
+ closure_init_stack(&cl);
+
+ /*
+ * check btree_ptr_hash_val() after @b is locked by
+ * btree_iter_traverse():
+ */
+ if (btree_ptr_hash_val(new_key) != b->hash_val) {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ if (ret) {
+ ret = drop_locks_do(trans, (closure_sync(&cl), 0));
+ if (ret)
+ return ret;
+ }
+
+ new_hash = bch2_btree_node_mem_alloc(trans, false);
+ }
+
+ path->intent_ref++;
+ ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
+ commit_flags, skip_triggers);
+ --path->intent_ref;
+
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&new_hash->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ six_unlock_write(&new_hash->c.lock);
+ six_unlock_intent(&new_hash->c.lock);
+ }
+ closure_sync(&cl);
+ bch2_btree_cache_cannibalize_unlock(c);
+ return ret;
+}
+
+int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
+ struct btree *b, struct bkey_i *new_key,
+ unsigned commit_flags, bool skip_triggers)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
+ /* has node been freed? */
+ if (iter.path->l[b->c.level].b != b) {
+ /* node has been freed: */
+ BUG_ON(!btree_node_dying(b));
+ goto out;
+ }
+
+ BUG_ON(!btree_node_hashed(b));
+
+ struct bch_extent_ptr *ptr;
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
+ !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
+
+ ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
+ commit_flags, skip_triggers);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* Init code: */
+
+/*
+ * Only for filesystem bringup, when first reading the btree roots or allocating
+ * btree roots when initializing a new filesystem:
+ */
+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
+{
+ BUG_ON(btree_node_root(c, b));
+
+ bch2_btree_set_root_inmem(c, b);
+}
+
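+/*
+ * Allocate an empty root node for btree @id when initializing a new
+ * filesystem: the node is marked fake (its key doesn't point at anything on
+ * disk yet) and flagged need_rewrite so it gets written out properly later.
+ */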
+static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+{
+ struct bch_fs *c = trans->c;
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+
+ b = bch2_btree_node_mem_alloc(trans, false);
+ bch2_btree_cache_cannibalize_unlock(c);
+
+ set_btree_node_fake(b);
+ set_btree_node_need_rewrite(b);
+ b->c.level = 0;
+ b->c.btree_id = id;
+
+ bkey_btree_ptr_init(&b->key);
+ b->key.k.p = SPOS_MAX;
+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
+
+ bch2_bset_init_first(b, &b->data->keys);
+ bch2_btree_build_aux_trees(b);
+
+ b->data->flags = 0;
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, SPOS_MAX);
+ b->data->format = bch2_btree_calc_format(b);
+ btree_node_set_format(b, b->data->format);
+
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
+ b->c.level, b->c.btree_id);
+ BUG_ON(ret);
+
+ bch2_btree_set_root_inmem(c, b);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ return 0;
+}
+
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+{
+ bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+}
+
+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct btree_update *as;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_for_each_entry(as, &c->btree_interior_update_list, list)
+ prt_printf(out, "%p m %u w %u r %u j %llu\n",
+ as,
+ as->mode,
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
+{
+ bool ret;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ ret = !list_empty(&c->btree_interior_update_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ return ret;
+}
+
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
+{
+ bool ret = bch2_btree_interior_updates_pending(c);
+
+ if (ret)
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_pending(c));
+ return ret;
+}
+
+void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
+{
+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
+
+ mutex_lock(&c->btree_root_lock);
+
+ r->level = entry->level;
+ r->alive = true;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+
+ mutex_unlock(&c->btree_root_lock);
+}
+
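+/*
+ * Emit a BCH_JSET_ENTRY_btree_root entry at @end for every live btree root,
+ * skipping the btree IDs set in the @skip bitmask; returns the new end of the
+ * entry buffer.
+ */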
+struct jset_entry *
+bch2_btree_roots_to_journal_entries(struct bch_fs *c,
+ struct jset_entry *end,
+ unsigned long skip)
+{
+ unsigned i;
+
+ mutex_lock(&c->btree_root_lock);
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (r->alive && !test_bit(i, &skip)) {
+ journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
+ i, r->level, &r->key, r->key.k.u64s);
+ end = vstruct_next(end);
+ }
+ }
+
+ mutex_unlock(&c->btree_root_lock);
+
+ return end;
+}
+
+void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
+{
+ if (c->btree_interior_update_worker)
+ destroy_workqueue(c->btree_interior_update_worker);
+ mempool_exit(&c->btree_interior_update_pool);
+}
+
+void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
+{
+ mutex_init(&c->btree_reserve_cache_lock);
+ INIT_LIST_HEAD(&c->btree_interior_update_list);
+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
+ mutex_init(&c->btree_interior_update_lock);
+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
+
+ INIT_LIST_HEAD(&c->pending_node_rewrites);
+ mutex_init(&c->pending_node_rewrites_lock);
+}
+
+int bch2_fs_btree_interior_update_init(struct bch_fs *c)
+{
+ c->btree_interior_update_worker =
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+ if (!c->btree_interior_update_worker)
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
+ if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+ sizeof(struct btree_update)))
+ return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
new file mode 100644
index 000000000000..a6668992a272
--- /dev/null
+++ b/fs/bcachefs/btree_update_interior.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+
+#include "btree_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+
+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
+
+#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
+
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ */
+struct btree_update {
+ struct closure cl;
+ struct bch_fs *c;
+ u64 start_time;
+
+ struct list_head list;
+ struct list_head unwritten_list;
+
+ /* What kind of update are we doing? */
+ enum {
+ BTREE_INTERIOR_NO_UPDATE,
+ BTREE_INTERIOR_UPDATING_NODE,
+ BTREE_INTERIOR_UPDATING_ROOT,
+ BTREE_INTERIOR_UPDATING_AS,
+ } mode;
+
+ unsigned nodes_written:1;
+ unsigned took_gc_lock:1;
+
+ enum btree_id btree_id;
+ unsigned update_level;
+
+ struct disk_reservation disk_res;
+
+ /*
+ * BTREE_INTERIOR_UPDATING_NODE:
+ * The update that made the new nodes visible was a regular update to an
+ * existing interior node - @b. We can't write out the update to @b
+ * until the new nodes we created are finished writing, so we block @b
+ * from writing by putting this btree_interior update on the
+ * @b->write_blocked list with @write_blocked_list:
+ */
+ struct btree *b;
+ struct list_head write_blocked_list;
+
+ /*
+ * We may be freeing nodes that were dirty, and thus had journal entries
+ * pinned: we need to transfer the oldest of those pins to the
+ * btree_update operation, and release it when the new node(s)
+ * are all persistent and reachable:
+ */
+ struct journal_entry_pin journal;
+
+ /* Preallocated nodes we reserve when we start the update: */
+ struct prealloc_nodes {
+ struct btree *b[BTREE_UPDATE_NODES_MAX];
+ unsigned nr;
+ } prealloc_nodes[2];
+
+ /* Nodes being freed: */
+ struct keylist old_keys;
+ u64 _old_keys[BTREE_UPDATE_NODES_MAX *
+ BKEY_BTREE_PTR_U64s_MAX];
+
+ /* Nodes being added: */
+ struct keylist new_keys;
+ u64 _new_keys[BTREE_UPDATE_NODES_MAX *
+ BKEY_BTREE_PTR_U64s_MAX];
+
+ /* New nodes, that will be made reachable by this update: */
+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
+ unsigned nr_new_nodes;
+
+ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
+ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
+ unsigned nr_old_nodes;
+
+ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
+ BCH_REPLICAS_MAX];
+ open_bucket_idx_t nr_open_buckets;
+
+ unsigned journal_u64s;
+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
+
+ /* Only here to reduce stack usage on recursive splits: */
+ struct keylist parent_keys;
+ /*
+ * Enough room for btree_split's keys without realloc - btree node
+ * pointers never have crc/compression info, so we only need to account
+ * for the pointers for three keys
+ */
+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
+
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+ struct btree_trans *,
+ struct btree *,
+ struct bkey_format);
+
+int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
+
+int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
+ unsigned, unsigned, enum btree_node_sibling);
+
+static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level, unsigned flags,
+ enum btree_node_sibling sib)
+{
+ struct btree *b;
+
+ EBUG_ON(!btree_node_locked(path, level));
+
+ b = path->l[level].b;
+ if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
+ return 0;
+
+ return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
+}
+
+static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ unsigned flags)
+{
+ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
+ btree_prev_sib) ?:
+ bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
+ btree_next_sib);
+}
+
+int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
+ struct btree *, unsigned);
+void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
+int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
+ struct btree *, struct bkey_i *,
+ unsigned, bool);
+int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
+ struct bkey_i *, unsigned, bool);
+
+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+
+static inline unsigned btree_update_reserve_required(struct bch_fs *c,
+ struct btree *b)
+{
+ unsigned depth = btree_node_root(c, b)->c.level + 1;
+
+ /*
+ * Number of nodes we might have to allocate in a worst case btree
+ * split operation - we split all the way up to the root, then allocate
+ * a new root, unless we're already at max depth:
+ */
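+ /*
+ * e.g. splitting a leaf (level 0) under a level 2 root gives depth 3, so
+ * we reserve (3 - 0) * 2 + 1 = 7 nodes - up to two new nodes per level
+ * for the splits, plus one for the new root.
+ */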
+ if (depth < BTREE_MAX_DEPTH)
+ return (depth - b->c.level) * 2 + 1;
+ else
+ return (depth - b->c.level) * 2 - 1;
+}
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+ b->sib_u64s[0] = b->nr.live_u64s;
+ b->sib_u64s[1] = b->nr.live_u64s;
+}
+
+static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+{
+ return (void *) b->data + btree_bytes(c);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
+ struct btree *b)
+{
+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
+ struct btree *b)
+{
+ return btree_data_end(c, b);
+}
+
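+/*
+ * b->written counts how many 512 byte sectors of the node have already been
+ * written out; anything below write_block() is on disk and can no longer be
+ * modified in place.
+ */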
+static inline void *write_block(struct btree *b)
+{
+ return (void *) b->data + (b->written << 9);
+}
+
+static inline bool __btree_addr_written(struct btree *b, void *p)
+{
+ return p < write_block(b);
+}
+
+static inline bool bset_written(struct btree *b, struct bset *i)
+{
+ return __btree_addr_written(b, i);
+}
+
+static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
+{
+ return __btree_addr_written(b, k);
+}
+
+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
+ struct btree *b,
+ void *end)
+{
+ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
+ b->whiteout_u64s;
+ ssize_t total = c->opts.btree_node_size >> 3;
+
+ /* Always leave one extra u64 for bch2_varint_decode: */
+ used++;
+
+ return total - used;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
+ struct btree *b)
+{
+ ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ btree_bkey_last(b, bset_tree_last(b)));
+
+ BUG_ON(remaining < 0);
+
+ if (bset_written(b, btree_bset_last(b)))
+ return 0;
+
+ return remaining;
+}
+
+#define BTREE_WRITE_SET_U64s_BITS 9
+
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+ /*
+ * Could buffer up larger amounts of keys for btrees with larger keys,
+ * pending benchmarking:
+ */
+ return 8 << BTREE_WRITE_SET_U64s_BITS;
+}
+
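+/*
+ * Decide whether to start a new bset in @b: returns where the new
+ * btree_node_entry would go, or NULL. If the last bset has already been
+ * written we need at least a block's worth of space free; if it's still open
+ * we only start a new one once it has grown past btree_write_set_buffer()
+ * and there's still room for more.
+ */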
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
+ struct btree *b)
+{
+ struct bset_tree *t = bset_tree_last(b);
+ struct btree_node_entry *bne = max(write_block(b),
+ (void *) btree_bkey_last(b, bset_tree_last(b)));
+ ssize_t remaining_space =
+ __bch_btree_u64s_remaining(c, b, bne->keys.start);
+
+ if (unlikely(bset_written(b, bset(b, t)))) {
+ if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+ return bne;
+ } else {
+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
+ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
+ return bne;
+ }
+
+ return NULL;
+}
+
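+/*
+ * Buffer a whiteout for @pos in @b's unwritten whiteout area, which
+ * accumulates at the end of the node's buffer and grows downward from
+ * btree_data_end(); the position is stored packed if possible, otherwise as
+ * an unpacked bkey with just the position set.
+ */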
+static inline void push_whiteout(struct bch_fs *c, struct btree *b,
+ struct bpos pos)
+{
+ struct bkey_packed k;
+
+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ EBUG_ON(btree_node_just_written(b));
+
+ if (!bkey_pack_pos(&k, pos, b)) {
+ struct bkey *u = (void *) &k;
+
+ bkey_init(u);
+ u->p = pos;
+ }
+
+ k.needs_whiteout = true;
+
+ b->whiteout_u64s += k.u64s;
+ bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
+}
+
+/*
+ * write lock must be held on @b (else the dirty bset that we were going to
+ * insert into could be written out from under us)
+ */
+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
+ struct btree *b, unsigned u64s)
+{
+ if (unlikely(btree_node_need_rewrite(b)))
+ return false;
+
+ return u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
+
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
+
+void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
+ struct jset_entry *, unsigned long);
+
+void bch2_do_pending_node_rewrites(struct bch_fs *);
+void bch2_free_pending_node_rewrites(struct bch_fs *);
+
+void bch2_fs_btree_interior_update_exit(struct bch_fs *);
+void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
+int bch2_fs_btree_interior_update_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
new file mode 100644
index 000000000000..4e6241db518b
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+
+#include <linux/sort.h>
+
+static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->btree, r->btree) ?:
+ bpos_cmp(l->k.k.p, r->k.k.p) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset);
+}
+
+static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ unsigned commit_flags,
+ bool *write_locked,
+ size_t *fast)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ path = iter->path;
+
+ if (!*write_locked) {
+ ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
+ *write_locked = true;
+ }
+
+ if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ *write_locked = false;
+ goto trans_commit;
+ }
+
+ bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
+ (*fast)++;
+
+ if (path->ref > 1) {
+ /*
+ * We can't clone a path that has write locks: if the path is
+ * shared, unlock before set_pos(), traverse():
+ */
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ *write_locked = false;
+ }
+ return 0;
+trans_commit:
+ return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ commit_flags|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM);
+}
+
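+/*
+ * Atomically reset the key count and flip the active buffer index, then wait
+ * for writers still holding a reference on the old buffer to drain before the
+ * caller starts flushing it.
+ */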
+static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
+{
+ union btree_write_buffer_state old, new;
+ u64 v = READ_ONCE(wb->state.v);
+
+ do {
+ old.v = new.v = v;
+
+ new.nr = 0;
+ new.idx++;
+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+ while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
+ cpu_relax();
+
+ smp_mb();
+
+ return old;
+}
+
+/*
+ * Update a btree with a write buffered key using the journal seq of the
+ * original write buffer insert.
+ *
+ * It is not safe to rejournal the key once it has been inserted into the write
+ * buffer because that may break recovery ordering. For example, the key may
+ * have already been modified in the active write buffer in a seq that comes
+ * before the current transaction. If we were to journal this key again and
+ * crash, recovery would process updates in the wrong order.
+ */
+static int
+btree_write_buffered_insert(struct btree_trans *trans,
+ struct btree_write_buffered_key *wb)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
+ bool locked)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct journal_entry_pin pin;
+ struct btree_write_buffered_key *i, *keys;
+ struct btree_iter iter = { NULL };
+ size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+ bool write_locked = false;
+ union btree_write_buffer_state s;
+ int ret = 0;
+
+ memset(&pin, 0, sizeof(pin));
+
+ if (!locked && !mutex_trylock(&wb->flush_lock))
+ return 0;
+
+ bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+ bch2_journal_pin_drop(j, &wb->journal_pin);
+
+ s = btree_write_buffer_switch(wb);
+ keys = wb->keys[s.idx];
+ nr = s.nr;
+
+ if (race_fault())
+ goto slowpath;
+
+ /*
+ * We first sort so that we can detect and skip redundant updates, and
+ * then we attempt to flush in sorted btree order, as this is most
+ * efficient.
+ *
+ * However, since we're not flushing in the order they appear in the
+ * journal we won't be able to drop our journal pin until everything is
+ * flushed - which means this could deadlock the journal if we weren't
+ * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * if it would block taking a journal reservation.
+ *
+ * If that happens, simply skip the key so we can optimistically insert
+ * as many keys as possible in the fast path.
+ */
+ sort(keys, nr, sizeof(keys[0]),
+ btree_write_buffered_key_cmp, NULL);
+
+ for (i = keys; i < keys + nr; i++) {
+ if (i + 1 < keys + nr &&
+ i[0].btree == i[1].btree &&
+ bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
+ skipped++;
+ i->journal_seq = 0;
+ continue;
+ }
+
+ if (write_locked &&
+ (iter.path->btree_id != i->btree ||
+ bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ write_locked = false;
+ }
+
+ if (!iter.path || iter.path->btree_id != i->btree) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
+ }
+
+ bch2_btree_iter_set_pos(&iter, i->k.k.p);
+ iter.path->preserve = false;
+
+ do {
+ ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
+ commit_flags, &write_locked, &fast);
+ if (!write_locked)
+ bch2_trans_begin(trans);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ slowpath++;
+ continue;
+ }
+ if (ret)
+ break;
+
+ i->journal_seq = 0;
+ }
+
+ if (write_locked)
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ bch2_trans_iter_exit(trans, &iter);
+
+ trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+
+ if (slowpath)
+ goto slowpath;
+
+ bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+out:
+ bch2_journal_pin_drop(j, &pin);
+ mutex_unlock(&wb->flush_lock);
+ return ret;
+slowpath:
+ trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+
+ /*
+ * Now sort the rest by journal seq and bump the journal pin as we go.
+ * The fast path zapped the seq of keys that were successfully flushed so
+ * we can skip those here.
+ */
+ sort(keys, nr, sizeof(keys[0]),
+ btree_write_buffered_journal_cmp,
+ NULL);
+
+ commit_flags &= ~BCH_WATERMARK_MASK;
+ commit_flags |= BCH_WATERMARK_reclaim;
+
+ for (i = keys; i < keys + nr; i++) {
+ if (!i->journal_seq)
+ continue;
+
+ if (i->journal_seq > pin.seq) {
+ struct journal_entry_pin pin2;
+
+ memset(&pin2, 0, sizeof(pin2));
+
+ bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
+ bch2_journal_pin_drop(j, &pin);
+ bch2_journal_pin_copy(j, &pin, &pin2, NULL);
+ bch2_journal_pin_drop(j, &pin2);
+ }
+
+ ret = commit_do(trans, NULL, NULL,
+ commit_flags|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM,
+ btree_write_buffered_insert(trans, i));
+ if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
+ break;
+ }
+
+ goto out;
+}
+
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ mutex_lock(&trans->c->btree_write_buffer.flush_lock);
+ return __bch2_btree_write_buffer_flush(trans, 0, true);
+}
+
+int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+{
+ return __bch2_btree_write_buffer_flush(trans, 0, false);
+}
+
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_lock(&wb->flush_lock);
+
+ return bch2_trans_run(c,
+ __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
+}
+
+static inline u64 btree_write_buffer_ref(int idx)
+{
+ return ((union btree_write_buffer_state) {
+ .ref0 = idx == 0,
+ .ref1 = idx == 1,
+ }).v;
+}
+
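+/*
+ * Copy the transaction's write buffered updates into the currently active
+ * buffer: take a reference on that buffer via the packed state word (so a
+ * concurrent flush can't reuse it underneath us), reserve space, copy the
+ * keys, pin the journal sequence they were journalled at, then drop the
+ * reference. Fails with -BCH_ERR_btree_insert_need_flush_buffer if the
+ * buffer is full.
+ */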
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key *i;
+ union btree_write_buffer_state old, new;
+ int ret = 0;
+ u64 v;
+
+ trans_for_each_wb_update(trans, i) {
+ EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ i->journal_seq = trans->journal_res.seq;
+ i->journal_offset = trans->journal_res.offset;
+ }
+
+ preempt_disable();
+ v = READ_ONCE(wb->state.v);
+ do {
+ old.v = new.v = v;
+
+ new.v += btree_write_buffer_ref(new.idx);
+ new.nr += trans->nr_wb_updates;
+ if (new.nr > wb->size) {
+ ret = -BCH_ERR_btree_insert_need_flush_buffer;
+ goto out;
+ }
+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+ memcpy(wb->keys[new.idx] + old.nr,
+ trans->wb_updates,
+ sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+out:
+ preempt_enable();
+ return ret;
+}
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+
+ kvfree(wb->keys[1]);
+ kvfree(wb->keys[0]);
+}
+
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_init(&wb->flush_lock);
+ wb->size = c->opts.btree_write_buffer_size;
+
+ wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
+ wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
+ if (!wb->keys[0] || !wb->keys[1])
+ return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
new file mode 100644
index 000000000000..322df1c8304e
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_H
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+int bch2_btree_write_buffer_flush(struct btree_trans *);
+
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
+int bch2_fs_btree_write_buffer_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
new file mode 100644
index 000000000000..99993ba77aea
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+
+#include "journal_types.h"
+
+#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
+#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
+
+struct btree_write_buffered_key {
+ u64 journal_seq;
+ unsigned journal_offset;
+ enum btree_id btree;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
+
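+/*
+ * All of the write buffer state is packed into a single 64 bit word so it can
+ * be updated atomically: nr is the number of keys in the active buffer, idx
+ * selects which of the two key arrays is active, and ref0/ref1 count writers
+ * currently copying keys into buffer 0/1 - the flush path waits for the old
+ * buffer's count to drop to zero after switching buffers.
+ */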
+union btree_write_buffer_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ u64 nr:23;
+ u64 idx:1;
+ u64 ref0:20;
+ u64 ref1:20;
+ };
+};
+
+struct btree_write_buffer {
+ struct mutex flush_lock;
+ struct journal_entry_pin journal_pin;
+
+ union btree_write_buffer_state state;
+ size_t size;
+
+ struct btree_write_buffered_key *keys[2];
+};
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
new file mode 100644
index 000000000000..5a91d3189fcf
--- /dev/null
+++ b/fs/bcachefs/buckets.c
@@ -0,0 +1,2170 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "ec.h"
+#include "error.h"
+#include "inode.h"
+#include "movinggc.h"
+#include "recovery.h"
+#include "reflink.h"
+#include "replicas.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/preempt.h>
+
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+ enum bch_data_type data_type,
+ s64 sectors)
+{
+ switch (data_type) {
+ case BCH_DATA_btree:
+ fs_usage->btree += sectors;
+ break;
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ fs_usage->data += sectors;
+ break;
+ case BCH_DATA_cached:
+ fs_usage->cached += sectors;
+ break;
+ default:
+ break;
+ }
+}
+
+void bch2_fs_usage_initialize(struct bch_fs *c)
+{
+ struct bch_fs_usage *usage;
+ struct bch_dev *ca;
+ unsigned i;
+
+ percpu_down_write(&c->mark_lock);
+ usage = c->usage_base;
+
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ usage->reserved += usage->persistent_reserved[i];
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+ }
+
+ for_each_member_device(ca, c, i) {
+ struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+ usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
+ ca->mi.bucket_size;
+ }
+
+ percpu_up_write(&c->mark_lock);
+}
+
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+ unsigned journal_seq,
+ bool gc)
+{
+ BUG_ON(!gc && !journal_seq);
+
+ return this_cpu_ptr(gc
+ ? ca->usage_gc
+ : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
+void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
+{
+ struct bch_fs *c = ca->fs;
+ unsigned seq, i, u64s = dev_usage_u64s();
+
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ memcpy(usage, ca->usage_base, u64s * sizeof(u64));
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
+}
+
+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
+{
+ ssize_t offset = v - (u64 *) c->usage_base;
+ unsigned i, seq;
+ u64 ret;
+
+ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
+ percpu_rwsem_assert_held(&c->mark_lock);
+
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ ret = *v;
+
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
+
+ return ret;
+}
+
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
+{
+ struct bch_fs_usage_online *ret;
+ unsigned nr_replicas = READ_ONCE(c->replicas.nr);
+ unsigned seq, i;
+retry:
+ ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
+ if (unlikely(!ret))
+ return NULL;
+
+ percpu_down_read(&c->mark_lock);
+
+ if (nr_replicas != c->replicas.nr) {
+ nr_replicas = c->replicas.nr;
+ percpu_up_read(&c->mark_lock);
+ kfree(ret);
+ goto retry;
+ }
+
+ ret->online_reserved = percpu_u64_get(c->online_reserved);
+
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ unsafe_memcpy(&ret->u, c->usage_base,
+ __fs_usage_u64s(nr_replicas) * sizeof(u64),
+ "embedded variable length struct");
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
+ __fs_usage_u64s(nr_replicas));
+ } while (read_seqcount_retry(&c->usage_lock, seq));
+
+ return ret;
+}
+
+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
+{
+ struct bch_dev *ca;
+ unsigned i, u64s = fs_usage_u64s(c);
+
+ BUG_ON(idx >= ARRAY_SIZE(c->usage));
+
+ preempt_disable();
+ write_seqcount_begin(&c->usage_lock);
+
+ acc_u64s_percpu((u64 *) c->usage_base,
+ (u64 __percpu *) c->usage[idx], u64s);
+ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ u64s = dev_usage_u64s();
+
+ acc_u64s_percpu((u64 *) ca->usage_base,
+ (u64 __percpu *) ca->usage[idx], u64s);
+ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+ }
+ rcu_read_unlock();
+
+ write_seqcount_end(&c->usage_lock);
+ preempt_enable();
+}
+
+void bch2_fs_usage_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_fs_usage_online *fs_usage)
+{
+ unsigned i;
+
+ prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
+
+ prt_printf(out, "hidden:\t\t\t\t%llu\n",
+ fs_usage->u.hidden);
+ prt_printf(out, "data:\t\t\t\t%llu\n",
+ fs_usage->u.data);
+ prt_printf(out, "cached:\t\t\t\t%llu\n",
+ fs_usage->u.cached);
+ prt_printf(out, "reserved:\t\t\t%llu\n",
+ fs_usage->u.reserved);
+ prt_printf(out, "nr_inodes:\t\t\t%llu\n",
+ fs_usage->u.nr_inodes);
+ prt_printf(out, "online reserved:\t\t%llu\n",
+ fs_usage->online_reserved);
+
+ for (i = 0;
+ i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
+ i++) {
+ prt_printf(out, "%u replicas:\n", i + 1);
+ prt_printf(out, "\treserved:\t\t%llu\n",
+ fs_usage->u.persistent_reserved[i]);
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ prt_printf(out, "\t");
+ bch2_replicas_entry_to_text(out, e);
+ prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
+ }
+}
+
+static u64 reserve_factor(u64 r)
+{
+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
+}
+
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
+{
+ return min(fs_usage->u.hidden +
+ fs_usage->u.btree +
+ fs_usage->u.data +
+ reserve_factor(fs_usage->u.reserved +
+ fs_usage->online_reserved),
+ c->capacity);
+}
+
+static struct bch_fs_usage_short
+__bch2_fs_usage_read_short(struct bch_fs *c)
+{
+ struct bch_fs_usage_short ret;
+ u64 data, reserved;
+
+ ret.capacity = c->capacity -
+ bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+
+ data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
+ bch2_fs_usage_read_one(c, &c->usage_base->btree);
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+ percpu_u64_get(c->online_reserved);
+
+ ret.used = min(ret.capacity, data + reserve_factor(reserved));
+ ret.free = ret.capacity - ret.used;
+
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+
+ return ret;
+}
+
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *c)
+{
+ struct bch_fs_usage_short ret;
+
+ percpu_down_read(&c->mark_lock);
+ ret = __bch2_fs_usage_read_short(c);
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
+}
+
+void bch2_dev_usage_init(struct bch_dev *ca)
+{
+ ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+}
+
+static inline int bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bch_alloc_v4 a)
+{
+ return a.dirty_sectors
+ ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
+ : 0;
+}
+
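+/*
+ * Apply the delta between an old and new alloc key to the per-device usage
+ * counters - bucket and sector counts per data type, erasure coded buckets,
+ * fragmented sectors - and to the filesystem-wide hidden counter for
+ * superblock/journal buckets.
+ */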
+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+ struct bch_alloc_v4 old,
+ struct bch_alloc_v4 new,
+ u64 journal_seq, bool gc)
+{
+ struct bch_fs_usage *fs_usage;
+ struct bch_dev_usage *u;
+
+ preempt_disable();
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
+
+ if (data_type_is_hidden(old.data_type))
+ fs_usage->hidden -= ca->mi.bucket_size;
+ if (data_type_is_hidden(new.data_type))
+ fs_usage->hidden += ca->mi.bucket_size;
+
+ u = dev_usage_ptr(ca, journal_seq, gc);
+
+ u->d[old.data_type].buckets--;
+ u->d[new.data_type].buckets++;
+
+ u->buckets_ec -= (int) !!old.stripe;
+ u->buckets_ec += (int) !!new.stripe;
+
+ u->d[old.data_type].sectors -= old.dirty_sectors;
+ u->d[new.data_type].sectors += new.dirty_sectors;
+
+ u->d[BCH_DATA_cached].sectors += new.cached_sectors;
+ u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
+
+ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
+ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+
+ preempt_enable();
+}
+
+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket old, struct bucket new,
+ u64 journal_seq, bool gc)
+{
+ struct bch_alloc_v4 old_a = {
+ .gen = old.gen,
+ .data_type = old.data_type,
+ .dirty_sectors = old.dirty_sectors,
+ .cached_sectors = old.cached_sectors,
+ .stripe = old.stripe,
+ };
+ struct bch_alloc_v4 new_a = {
+ .gen = new.gen,
+ .data_type = new.data_type,
+ .dirty_sectors = new.dirty_sectors,
+ .cached_sectors = new.cached_sectors,
+ .stripe = new.stripe,
+ };
+
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
+}
+
+static inline int __update_replicas(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct bch_replicas_entry *r,
+ s64 sectors)
+{
+ int idx = bch2_replicas_entry_idx(c, r);
+
+ if (idx < 0)
+ return -1;
+
+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage->replicas[idx] += sectors;
+ return 0;
+}
+
+static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_replicas_entry *r, s64 sectors,
+ unsigned journal_seq, bool gc)
+{
+ struct bch_fs_usage *fs_usage;
+ int idx, ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ percpu_down_read(&c->mark_lock);
+
+ idx = bch2_replicas_entry_idx(c, r);
+ if (idx < 0 &&
+ fsck_err(c, ptr_to_missing_replicas_entry,
+ "no replicas entry\n while marking %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ percpu_up_read(&c->mark_lock);
+ ret = bch2_mark_replicas(c, r);
+ percpu_down_read(&c->mark_lock);
+
+ if (ret)
+ goto err;
+ idx = bch2_replicas_entry_idx(c, r);
+ }
+ if (idx < 0) {
+ ret = -1;
+ goto err;
+ }
+
+ preempt_disable();
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage->replicas[idx] += sectors;
+ preempt_enable();
+err:
+fsck_err:
+ percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static inline int update_cached_sectors(struct bch_fs *c,
+ struct bkey_s_c k,
+ unsigned dev, s64 sectors,
+ unsigned journal_seq, bool gc)
+{
+ struct bch_replicas_padded r;
+
+ bch2_replicas_entry_cached(&r.e, dev);
+
+ return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
+}
+
+static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
+ gfp_t gfp)
+{
+ struct replicas_delta_list *d = trans->fs_usage_deltas;
+ unsigned new_size = d ? (d->size + more) * 2 : 128;
+ unsigned alloc_size = sizeof(*d) + new_size;
+
+ WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
+
+ if (!d || d->used + more > d->size) {
+ d = krealloc(d, alloc_size, gfp|__GFP_ZERO);
+
+ if (unlikely(!d)) {
+ if (alloc_size > REPLICAS_DELTA_LIST_MAX)
+ return -ENOMEM;
+
+ d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
+ if (!d)
+ return -ENOMEM;
+
+ memset(d, 0, REPLICAS_DELTA_LIST_MAX);
+
+ if (trans->fs_usage_deltas)
+ memcpy(d, trans->fs_usage_deltas,
+ trans->fs_usage_deltas->size + sizeof(*d));
+
+ new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
+ kfree(trans->fs_usage_deltas);
+ }
+
+ d->size = new_size;
+ trans->fs_usage_deltas = d;
+ }
+
+ return 0;
+}
+
+int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
+{
+ return allocate_dropping_locks_errcode(trans,
+ __replicas_deltas_realloc(trans, more, _gfp));
+}
+
+static inline int update_replicas_list(struct btree_trans *trans,
+ struct bch_replicas_entry *r,
+ s64 sectors)
+{
+ struct replicas_delta_list *d;
+ struct replicas_delta *n;
+ unsigned b;
+ int ret;
+
+ if (!sectors)
+ return 0;
+
+ b = replicas_entry_bytes(r) + 8;
+ ret = bch2_replicas_deltas_realloc(trans, b);
+ if (ret)
+ return ret;
+
+ d = trans->fs_usage_deltas;
+ n = (void *) d->d + d->used;
+ n->delta = sectors;
+ unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
+ r, replicas_entry_bytes(r),
+ "flexible array member embedded in struct with padding");
+ bch2_replicas_entry_sort(&n->r);
+ d->used += b;
+ return 0;
+}
+
+static inline int update_cached_sectors_list(struct btree_trans *trans,
+ unsigned dev, s64 sectors)
+{
+ struct bch_replicas_padded r;
+
+ bch2_replicas_entry_cached(&r.e, dev);
+
+ return update_replicas_list(trans, &r.e, sectors);
+}
+
+int bch2_mark_alloc(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ u64 bucket_journal_seq;
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 old_a_convert, new_a_convert;
+ const struct bch_alloc_v4 *old_a, *new_a;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ /*
+ * alloc btree is read in by bch2_alloc_read, not gc:
+ */
+ if ((flags & BTREE_TRIGGER_GC) &&
+ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
+ return 0;
+
+ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+ "alloc key for invalid device or bucket"))
+ return -EIO;
+
+ ca = bch_dev_bkey_exists(c, new.k->p.inode);
+
+ old_a = bch2_alloc_to_v4(old, &old_a_convert);
+ new_a = bch2_alloc_to_v4(new, &new_a_convert);
+
+ bucket_journal_seq = new_a->journal_seq;
+
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ data_type_is_empty(old_a->data_type) !=
+ data_type_is_empty(new_a->data_type) &&
+ new.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
+
+ EBUG_ON(!journal_seq);
+
+ /*
+ * If the btree updates referring to a bucket weren't flushed
+ * before the bucket became empty again, then we don't have
+ * to wait on a journal flush before we can reuse the bucket:
+ */
+ v->journal_seq = bucket_journal_seq =
+ data_type_is_empty(new_a->data_type) &&
+ (journal_seq == v->journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, v->journal_seq))
+ ? 0 : journal_seq;
+ }
+
+ if (!data_type_is_empty(old_a->data_type) &&
+ data_type_is_empty(new_a->data_type) &&
+ bucket_journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ bucket_journal_seq);
+ if (ret) {
+ bch2_fs_fatal_error(c,
+ "error setting bucket_needs_journal_commit: %i", ret);
+ return ret;
+ }
+ }
+
+ percpu_down_read(&c->mark_lock);
+ if (!gc && new_a->gen != old_a->gen)
+ *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+
+ bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);
+
+ if (gc) {
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+ bucket_lock(g);
+
+ g->gen_valid = 1;
+ g->gen = new_a->gen;
+ g->data_type = new_a->data_type;
+ g->stripe = new_a->stripe;
+ g->stripe_redundancy = new_a->stripe_redundancy;
+ g->dirty_sectors = new_a->dirty_sectors;
+ g->cached_sectors = new_a->cached_sectors;
+
+ bucket_unlock(g);
+ }
+ percpu_up_read(&c->mark_lock);
+
+ /*
+ * need to know if we're getting called from the invalidate path or
+ * not:
+ */
+
+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ old_a->cached_sectors) {
+ ret = update_cached_sectors(c, new, ca->dev_idx,
+ -((s64) old_a->cached_sectors),
+ journal_seq, gc);
+ if (ret) {
+ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
+ return ret;
+ }
+ }
+
+ if (new_a->data_type == BCH_DATA_free &&
+ (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
+ if (new_a->data_type == BCH_DATA_need_discard &&
+ (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
+ bch2_do_discards(c);
+
+ if (old_a->data_type != BCH_DATA_cached &&
+ new_a->data_type == BCH_DATA_cached &&
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+ bch2_do_invalidates(c);
+
+ if (new_a->data_type == BCH_DATA_need_gc_gens)
+ bch2_do_gc_gens(c);
+
+ return 0;
+}
+
+int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type data_type,
+ unsigned sectors, struct gc_pos pos,
+ unsigned flags)
+{
+ struct bucket old, new, *g;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+ BUG_ON(data_type != BCH_DATA_sb &&
+ data_type != BCH_DATA_journal);
+
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return 0;
+
+ percpu_down_read(&c->mark_lock);
+ g = gc_bucket(ca, b);
+
+ bucket_lock(g);
+ old = *g;
+
+ if (bch2_fs_inconsistent_on(g->data_type &&
+ g->data_type != data_type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_types[g->data_type],
+ bch2_data_types[data_type])) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
+ ca->dev_idx, b, g->gen,
+ bch2_data_types[g->data_type ?: data_type],
+ g->dirty_sectors, sectors)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ new = *g;
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+}
+
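+/*
+ * Sanity check a pointer against the bucket it points into before adjusting
+ * sector counts: catches pointer generations newer than the bucket gen, stale
+ * dirty pointers, mismatched data types within a bucket, and sector count
+ * overflow. Returns 1 (not an error) for stale cached pointers, which the
+ * callers simply skip.
+ */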
+static int check_bucket_ref(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 dirty_sectors, u32 cached_sectors)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
+ u32 bucket_sectors = !ptr->cached
+ ? dirty_sectors
+ : cached_sectors;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (bucket_data_type == BCH_DATA_cached)
+ bucket_data_type = BCH_DATA_user;
+
+ if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
+ (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe))
+ bucket_data_type = ptr_data_type = BCH_DATA_stripe;
+
+ if (gen_after(ptr->gen, b_gen)) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+
+ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_too_stale,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+
+ if (b_gen != ptr->gen && !ptr->cached) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_stale_dirty_ptr,
+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ *bucket_gen(ca, bucket_nr),
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+
+ if (b_gen != ptr->gen) {
+ ret = 1;
+ goto out;
+ }
+
+ if (!data_type_is_empty(bucket_data_type) &&
+ ptr_data_type &&
+ bucket_data_type != ptr_data_type) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_types[bucket_data_type],
+ bch2_data_types[ptr_data_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+
+ if ((u64) bucket_sectors + sectors > U32_MAX) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_bucket_sector_count_overflow,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bucket_sectors, sectors,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+out:
+ printbuf_exit(&buf);
+ return ret;
+err:
+ bch2_dump_trans_updates(trans);
+ goto out;
+}
+
+static int mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned ptr_idx,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ u64 journal_seq = trans->journal_res.seq;
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
+ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket old, new, *g;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ /* XXX doesn't handle deletion */
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_GC_BUCKET(ca, ptr);
+
+ if (g->dirty_sectors ||
+ (g->stripe && g->stripe != k.k->p.offset)) {
+ bch2_fs_inconsistent(c,
+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EINVAL;
+ goto err;
+ }
+
+ bucket_lock(g);
+ old = *g;
+
+ ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
+ g->gen, g->data_type,
+ g->dirty_sectors, g->cached_sectors);
+ if (ret)
+ goto err;
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+
+ g->stripe = k.k->p.offset;
+ g->stripe_redundancy = s->nr_redundant;
+ new = *g;
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
+ percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 bucket_gen, u8 *bucket_data_type,
+ u32 *dirty_sectors, u32 *cached_sectors)
+{
+ u32 *dst_sectors = !ptr->cached
+ ? dirty_sectors
+ : cached_sectors;
+ int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
+ bucket_gen, *bucket_data_type,
+ *dirty_sectors, *cached_sectors);
+
+ if (ret)
+ return ret;
+
+ *dst_sectors += sectors;
+
+ if (!*dirty_sectors && !*cached_sectors)
+ *bucket_data_type = 0;
+ else if (*bucket_data_type != BCH_DATA_stripe)
+ *bucket_data_type = ptr_data_type;
+
+ return 0;
+}
+
+static int bch2_mark_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ s64 sectors,
+ unsigned flags)
+{
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket old, new, *g;
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ u8 bucket_data_type;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_GC_BUCKET(ca, &p.ptr);
+ bucket_lock(g);
+ old = *g;
+
+ bucket_data_type = g->data_type;
+ ret = __mark_pointer(trans, k, &p.ptr, sectors,
+ data_type, g->gen,
+ &bucket_data_type,
+ &g->dirty_sectors,
+ &g->cached_sectors);
+ if (!ret)
+ g->data_type = bucket_data_type;
+
+ new = *g;
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
+}
+
+static int bch2_mark_stripe_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bch_extent_stripe_ptr p,
+ enum bch_data_type data_type,
+ s64 sectors,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_replicas_padded r;
+ struct gc_stripe *m;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ (u64) p.idx);
+ return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ }
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+
+ if (!m || !m->alive) {
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
+ (u64) p.idx);
+ bch2_inconsistent_error(c);
+ return -EIO;
+ }
+
+ m->block_sectors[p.block] += sectors;
+
+ r = m->r;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ r.e.data_type = data_type;
+ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+
+ return 0;
+}
+
+static int __mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_replicas_padded r;
+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+ ? BCH_DATA_btree
+ : BCH_DATA_user;
+ s64 sectors = bkey_is_btree_ptr(k.k)
+ ? btree_sectors(c)
+ : k.k->size;
+ s64 dirty_sectors = 0;
+ bool stale;
+ int ret;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ r.e.data_type = data_type;
+ r.e.nr_devs = 0;
+ r.e.nr_required = 1;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ s64 disk_sectors = ptr_disk_sectors(sectors, p);
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ disk_sectors = -disk_sectors;
+
+ ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
+ if (ret < 0)
+ return ret;
+
+ stale = ret > 0;
+
+ if (p.ptr.cached) {
+ if (!stale) {
+ ret = update_cached_sectors(c, k, p.ptr.dev,
+ disk_sectors, journal_seq, true);
+ if (ret) {
+ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
+ return ret;
+ }
+ }
+ } else if (!p.has_ec) {
+ dirty_sectors += disk_sectors;
+ r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+ } else {
+ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
+ disk_sectors, flags);
+ if (ret)
+ return ret;
+
+ /*
+ * There may be other dirty pointers in this extent, but
+ * if so they're not required for mounting if we have an
+ * erasure coded pointer in this extent:
+ */
+ r.e.nr_required = 0;
+ }
+ }
+
+ if (r.e.nr_devs) {
+ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
+}
+
+int bch2_mark_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ u64 idx = new.k->p.offset;
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
+ unsigned i;
+ int ret;
+
+ BUG_ON(gc && old_s);
+
+ if (!gc) {
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ if (!m) {
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+ "old %s\n"
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ bch2_inconsistent_error(c);
+ return -1;
+ }
+
+ if (!new_s) {
+ bch2_stripes_heap_del(c, m, idx);
+
+ memset(m, 0, sizeof(*m));
+ } else {
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->algorithm = new_s->algorithm;
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < new_s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+
+ if (!old_s)
+ bch2_stripes_heap_insert(c, m, idx);
+ else
+ bch2_stripes_heap_update(c, m, idx);
+ }
+ } else {
+ struct gc_stripe *m =
+ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ idx);
+ return -BCH_ERR_ENOMEM_mark_stripe;
+ }
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ */
+ m->alive = true;
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+
+ for (i = 0; i < new_s->nr_blocks; i++)
+ m->ptrs[i] = new_s->ptrs[i];
+
+ bch2_bkey_to_replicas(&m->r.e, new);
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(m->block_sectors, 0, sizeof(m->block_sectors));
+
+ for (i = 0; i < new_s->nr_blocks; i++) {
+ ret = mark_stripe_bucket(trans, new, i, flags);
+ if (ret)
+ return ret;
+ }
+
+ ret = update_replicas(c, new, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant),
+ journal_seq, gc);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, new);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int __mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *fs_usage;
+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+ s64 sectors = (s64) k.k->size;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ sectors = -sectors;
+ sectors *= replicas;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+
+ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
+ replicas = clamp_t(unsigned, replicas, 1,
+ ARRAY_SIZE(fs_usage->persistent_reserved));
+
+ fs_usage->reserved += sectors;
+ fs_usage->persistent_reserved[replicas - 1] += sectors;
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+
+ return 0;
+}
+
+int bch2_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 start, u64 end,
+ u64 *idx, unsigned flags, size_t r_idx)
+{
+ struct bch_fs *c = trans->c;
+ struct reflink_gc *r;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 next_idx = end;
+ s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
+
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
+ goto not_found;
+
+ BUG_ON((s64) r->refcount + add < 0);
+
+ r->refcount += add;
+ *idx = r->offset;
+ return 0;
+not_found:
+ if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
+ struct bkey_i_error *new;
+
+ new = bch2_trans_kmalloc(trans, sizeof(*new));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ bkey_init(&new->k);
+ new->k.type = KEY_TYPE_error;
+ new->k.p = bkey_start_pos(p.k);
+ new->k.p.offset += *idx - start;
+ bch2_key_resize(&new->k, next_idx - *idx);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
+ BTREE_TRIGGER_NORUN);
+ }
+
+ *idx = next_idx;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ struct reflink_gc *ref;
+ size_t l, r, m;
+ u64 idx = le64_to_cpu(p.v->idx), start = idx;
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
+ idx -= le32_to_cpu(p.v->front_pad);
+ end += le32_to_cpu(p.v->back_pad);
+ }
+
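+	/*
+	 * Binary search the reflink_gc table (sorted by offset) for the first
+	 * entry that could contain idx:
+	 */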
+ l = 0;
+ r = c->reflink_gc_nr;
+ while (l < r) {
+ m = l + (r - l) / 2;
+
+ ref = genradix_ptr(&c->reflink_gc_table, m);
+ if (ref->offset <= idx)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ while (idx < end && !ret)
+ ret = __bch2_mark_reflink_p(trans, p, start, end,
+ &idx, flags, l++);
+
+ return ret;
+}
+
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
+void bch2_trans_fs_usage_revert(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *dst;
+ struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
+ s64 added = 0;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ /* revert changes: */
+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+ switch (d->r.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ added += d->delta;
+ }
+ BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
+ }
+
+ dst->nr_inodes -= deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ added -= deltas->persistent_reserved[i];
+ dst->reserved -= deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
+ }
+
+ if (added > 0) {
+ trans->disk_res->sectors += added;
+ this_cpu_add(*c->online_reserved, added);
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ static int warned_disk_usage = 0;
+ bool warn = false;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ struct replicas_delta *d, *d2;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ s64 added = 0, should_not_have_added;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+ switch (d->r.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ added += d->delta;
+ }
+
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
+ }
+
+ dst->nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ added += deltas->persistent_reserved[i];
+ dst->reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ /*
+ * Not allowed to reduce sectors_available except by getting a
+ * reservation:
+ */
+ should_not_have_added = added - (s64) disk_res_sectors;
+ if (unlikely(should_not_have_added > 0)) {
+ u64 old, new, v = atomic64_read(&c->sectors_available);
+
+ do {
+ old = v;
+ new = max_t(s64, 0, old - should_not_have_added);
+ } while ((v = atomic64_cmpxchg(&c->sectors_available,
+ old, new)) != old);
+
+ added -= should_not_have_added;
+ warn = true;
+ }
+
+ if (added > 0) {
+ trans->disk_res->sectors -= added;
+ this_cpu_sub(*c->online_reserved, added);
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+
+ if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
+ bch2_trans_inconsistent(trans,
+			"disk usage increased %lli more than %llu sectors reserved",
+ should_not_have_added, disk_res_sectors);
+ return 0;
+need_mark:
+ /* revert changes: */
+ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
+ BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ return -1;
+}
+
+/* trans_mark: */
+
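+/*
+ * Transactional counterpart of bch2_mark_pointer(): updates the pointer's
+ * bucket in the alloc btree, and the backpointer for non cached pointers, so
+ * the accounting change is journalled atomically with the extent:
+ */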
+static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ unsigned flags)
+{
+ bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ struct bpos bucket;
+ struct bch_backpointer bp;
+ s64 sectors;
+ int ret;
+
+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
+ sectors = bp.bucket_len;
+ if (!insert)
+ sectors = -sectors;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, bucket);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
+ a->v.gen, &a->v.data_type,
+ &a->v.dirty_sectors, &a->v.cached_sectors) ?:
+ bch2_trans_update(trans, &iter, &a->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ if (!p.ptr.cached) {
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
+ struct extent_ptr_decoded p,
+ s64 sectors, enum bch_data_type data_type)
+{
+ struct btree_iter iter;
+ struct bkey_i_stripe *s;
+ struct bch_replicas_padded r;
+ int ret = 0;
+
+ s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_stripes, POS(0, p.ec.idx),
+ BTREE_ITER_WITH_UPDATES, stripe);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.ec.idx);
+ goto err;
+ }
+
+ if (!bch2_ptr_matches_stripe(&s->v, p)) {
+ bch2_trans_inconsistent(trans,
+ "stripe pointer doesn't match stripe %llu",
+ (u64) p.ec.idx);
+ ret = -EIO;
+ goto err;
+ }
+
+ stripe_blockcount_set(&s->v, p.ec.block,
+ stripe_blockcount_get(&s->v, p.ec.block) +
+ sectors);
+
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+ r.e.data_type = data_type;
+ ret = update_replicas_list(trans, &r.e, sectors);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int __trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_replicas_padded r;
+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+ ? BCH_DATA_btree
+ : BCH_DATA_user;
+ s64 sectors = bkey_is_btree_ptr(k.k)
+ ? btree_sectors(c)
+ : k.k->size;
+ s64 dirty_sectors = 0;
+ bool stale;
+ int ret = 0;
+
+ r.e.data_type = data_type;
+ r.e.nr_devs = 0;
+ r.e.nr_required = 1;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ s64 disk_sectors = ptr_disk_sectors(sectors, p);
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ disk_sectors = -disk_sectors;
+
+ ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
+ if (ret < 0)
+ return ret;
+
+ stale = ret > 0;
+
+ if (p.ptr.cached) {
+ if (!stale) {
+ ret = update_cached_sectors_list(trans, p.ptr.dev,
+ disk_sectors);
+ if (ret)
+ return ret;
+ }
+ } else if (!p.has_ec) {
+ dirty_sectors += disk_sectors;
+ r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+ } else {
+ ret = bch2_trans_mark_stripe_ptr(trans, p,
+ disk_sectors, data_type);
+ if (ret)
+ return ret;
+
+ r.e.nr_required = 0;
+ }
+ }
+
+ if (r.e.nr_devs)
+ ret = update_replicas_list(trans, &r.e, dirty_sectors);
+
+ return ret;
+}
+
+int bch2_trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
+ (int) bch2_bkey_needs_rebalance(c, old);
+
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ if (ret)
+ return ret;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
+}
+
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity : 0;
+ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
+ int ret = 0;
+
+ if (deleting)
+ sectors = -sectors;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
+ a->v.gen, a->v.data_type,
+ a->v.dirty_sectors, a->v.cached_sectors);
+ if (ret)
+ goto err;
+
+ if (!deleting) {
+ if (bch2_trans_inconsistent_on(a->v.stripe ||
+ a->v.stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
+ a->v.stripe, s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
+ s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = s.k->p.offset;
+ a->v.stripe_redundancy = s.v->nr_redundant;
+ a->v.data_type = BCH_DATA_stripe;
+ } else {
+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+ a->v.stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ s.k->p.offset, a->v.stripe)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = 0;
+ a->v.stripe_redundancy = 0;
+ a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
+ }
+
+ a->v.dirty_sectors += sectors;
+ if (data_type)
+ a->v.data_type = !deleting ? data_type : 0;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_trans_mark_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ const struct bch_stripe *old_s = NULL;
+ struct bch_stripe *new_s = NULL;
+ struct bch_replicas_padded r;
+ unsigned i, nr_blocks;
+ int ret = 0;
+
+ if (old.k->type == KEY_TYPE_stripe)
+ old_s = bkey_s_c_to_stripe(old).v;
+ if (new->k.type == KEY_TYPE_stripe)
+ new_s = &bkey_i_to_stripe(new)->v;
+
+ /*
+ * If the pointers aren't changing, we don't need to do anything:
+ */
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ return 0;
+
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
+
+ nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
+
+ if (new_s) {
+ s64 sectors = le16_to_cpu(new_s->sectors);
+
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
+ ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+
+ bch2_bkey_to_replicas(&r.e, old);
+ ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
+
+ if (new_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_i_to_s_c_stripe(new), i, false);
+ if (ret)
+ break;
+ }
+
+ if (old_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int __trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+ s64 sectors = (s64) k.k->size;
+ struct replicas_delta_list *d;
+ int ret;
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ sectors = -sectors;
+ sectors *= replicas;
+
+ ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (ret)
+ return ret;
+
+ d = trans->fs_usage_deltas;
+ replicas = clamp_t(unsigned, replicas, 1,
+ ARRAY_SIZE(d->persistent_reserved));
+
+ d->persistent_reserved[replicas - 1] += sectors;
+ return 0;
+}
+
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static int trans_mark_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i *k;
+ __le64 *refcount;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ k = bch2_bkey_get_mut_noupdate(trans, &iter,
+ BTREE_ID_reflink, POS(0, *idx),
+ BTREE_ITER_WITH_UPDATES);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ refcount = bkey_refcount(k);
+ if (!refcount) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "nonexistent indirect extent at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "indirect extent refcount underflow at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
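+	/*
+	 * On insert, round front_pad/back_pad out to the boundaries of the
+	 * indirect extent we landed in, so the referenced range always covers
+	 * whole indirect extents:
+	 */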
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
+ le64_add_cpu(refcount, add);
+
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ if (ret)
+ goto err;
+
+ *idx = k->k.p.offset;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx, end_idx;
+ int ret = 0;
+
+ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+ end_idx = le64_to_cpu(p.v->idx) + p.k->size +
+ le32_to_cpu(p.v->back_pad);
+
+ while (idx < end_idx && !ret)
+ ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
+ return ret;
+}
+
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ int ret = 0;
+
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ if (a->v.data_type && type && a->v.data_type != type) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_bucket_metadata_type_mismatch,
+ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ bch2_data_types[type],
+ bch2_data_types[type]);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (a->v.data_type != type ||
+ a->v.dirty_sectors != sectors) {
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ return commit_do(trans, NULL, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+}
+
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+ struct bch_dev *ca,
+ u64 start, u64 end,
+ enum bch_data_type type,
+ u64 *bucket, unsigned *bucket_sectors)
+{
+ do {
+ u64 b = sector_to_bucket(ca, start);
+ unsigned sectors =
+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+ if (b != *bucket && *bucket_sectors) {
+ int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
+ type, *bucket_sectors);
+ if (ret)
+ return ret;
+
+ *bucket_sectors = 0;
+ }
+
+ *bucket = b;
+ *bucket_sectors += sectors;
+ start += sectors;
+ } while (start < end);
+
+ return 0;
+}
+
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
+ struct bch_dev *ca)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 bucket = 0;
+ unsigned i, bucket_sectors = 0;
+ int ret;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR) {
+ ret = bch2_trans_mark_metadata_sectors(trans, ca,
+ 0, BCH_SB_SECTOR,
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ if (bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, ca,
+ bucket, BCH_DATA_sb, bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < ca->journal.nr; i++) {
+ ret = bch2_trans_mark_metadata_bucket(trans, ca,
+ ca->journal.buckets[i],
+ BCH_DATA_journal, ca->mi.bucket_size);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
+{
+ int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ int ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/* Disk reservations: */
+
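+/*
+ * Each CPU caches a batch of sectors from the global sectors_available
+ * counter, so most reservations only touch a percpu counter instead of the
+ * shared atomic:
+ */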
+#define SECTORS_CACHE 1024
+
+int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ u64 sectors, int flags)
+{
+ struct bch_fs_pcpu *pcpu;
+ u64 old, v, get;
+ s64 sectors_available;
+ int ret;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ pcpu = this_cpu_ptr(c->pcpu);
+
+ if (sectors <= pcpu->sectors_available)
+ goto out;
+
+ v = atomic64_read(&c->sectors_available);
+ do {
+ old = v;
+ get = min((u64) sectors + SECTORS_CACHE, old);
+
+ if (get < sectors) {
+ preempt_enable();
+ goto recalculate;
+ }
+ } while ((v = atomic64_cmpxchg(&c->sectors_available,
+ old, old - get)) != old);
+
+ pcpu->sectors_available += get;
+
+out:
+ pcpu->sectors_available -= sectors;
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ return 0;
+
+recalculate:
+ mutex_lock(&c->sectors_available_lock);
+
+ percpu_u64_set(&c->pcpu->sectors_available, 0);
+ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
+
+ if (sectors <= sectors_available ||
+ (flags & BCH_DISK_RESERVATION_NOFAIL)) {
+ atomic64_set(&c->sectors_available,
+ max_t(s64, 0, sectors_available - sectors));
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+ ret = 0;
+ } else {
+ atomic64_set(&c->sectors_available, sectors_available);
+ ret = -BCH_ERR_ENOSPC_disk_reservation;
+ }
+
+ mutex_unlock(&c->sectors_available_lock);
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
+}
+
+/* Startup/shutdown: */
+
+static void bucket_gens_free_rcu(struct rcu_head *rcu)
+{
+ struct bucket_gens *buckets =
+ container_of(rcu, struct bucket_gens, rcu);
+
+ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+}
+
+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
+ unsigned long *buckets_nouse = NULL;
+ bool resize = ca->bucket_gens != NULL;
+ int ret;
+
+ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+ GFP_KERNEL|__GFP_ZERO))) {
+ ret = -BCH_ERR_ENOMEM_bucket_gens;
+ goto err;
+ }
+
+ if ((c->opts.buckets_nouse &&
+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO)))) {
+ ret = -BCH_ERR_ENOMEM_buckets_nouse;
+ goto err;
+ }
+
+ bucket_gens->first_bucket = ca->mi.first_bucket;
+ bucket_gens->nbuckets = nbuckets;
+
+ if (resize) {
+ down_write(&c->gc_lock);
+ down_write(&ca->bucket_lock);
+ percpu_down_write(&c->mark_lock);
+ }
+
+ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
+
+ if (resize) {
+ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
+
+ memcpy(bucket_gens->b,
+ old_bucket_gens->b,
+ n);
+ if (buckets_nouse)
+ memcpy(buckets_nouse,
+ ca->buckets_nouse,
+ BITS_TO_LONGS(n) * sizeof(unsigned long));
+ }
+
+ rcu_assign_pointer(ca->bucket_gens, bucket_gens);
+ bucket_gens = old_bucket_gens;
+
+ swap(ca->buckets_nouse, buckets_nouse);
+
+ nbuckets = ca->mi.nbuckets;
+
+ if (resize) {
+ percpu_up_write(&c->mark_lock);
+ up_write(&ca->bucket_lock);
+ up_write(&c->gc_lock);
+ }
+
+ ret = 0;
+err:
+ kvpfree(buckets_nouse,
+ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+ if (bucket_gens)
+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
+
+ return ret;
+}
+
+void bch2_dev_buckets_free(struct bch_dev *ca)
+{
+ unsigned i;
+
+ kvpfree(ca->buckets_nouse,
+ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+ sizeof(struct bucket_gens) + ca->mi.nbuckets);
+
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ free_percpu(ca->usage[i]);
+ kfree(ca->usage_base);
+}
+
+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned i;
+
+ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+ if (!ca->usage_base)
+ return -BCH_ERR_ENOMEM_usage_init;
+
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage[i])
+ return -BCH_ERR_ENOMEM_usage_init;
+ }
+
+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
+}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
new file mode 100644
index 000000000000..21f6cb356921
--- /dev/null
+++ b/fs/bcachefs/buckets.h
@@ -0,0 +1,458 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#ifndef _BUCKETS_H
+#define _BUCKETS_H
+
+#include "buckets_types.h"
+#include "extents.h"
+#include "sb-members.h"
+
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+ return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+ return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+ u32 remainder;
+
+ div_u64_rem(s, ca->mi.bucket_size, &remainder);
+ return remainder;
+}
+
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+ u32 *offset)
+{
+ return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
+
+#define for_each_bucket(_b, _buckets) \
+ for (_b = (_buckets)->b + (_buckets)->first_bucket; \
+ _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
+
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ * while (xchg(&b->lock, 1)) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR 0
+#else
+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+ ulong ulong;
+ u8 byte;
+};
+
+static inline void bucket_unlock(struct bucket *b)
+{
+ BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+
+ clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
+ wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
+}
+
+static inline void bucket_lock(struct bucket *b)
+{
+ wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
+{
+ return rcu_dereference_check(ca->buckets_gc,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
+{
+ struct bucket_array *buckets = gc_bucket_array(ca);
+
+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+ return buckets->b + b;
+}
+
+static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
+{
+ return rcu_dereference_check(ca->bucket_gens,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
+{
+ struct bucket_gens *gens = bucket_gens(ca);
+
+ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);
+ return gens->b + b;
+}
+
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return sector_to_bucket(ca, ptr->offset);
+}
+
+static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
+ const struct bch_extent_ptr *ptr)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+ const struct bch_extent_ptr *ptr,
+ u32 *bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
+}
+
+static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline enum bch_data_type ptr_data_type(const struct bkey *k,
+ const struct bch_extent_ptr *ptr)
+{
+ if (bkey_is_btree_ptr(k))
+ return BCH_DATA_btree;
+
+ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
+}
+
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{
+ EBUG_ON(sectors < 0);
+
+ return crc_is_compressed(p.crc)
+ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+ p.crc.uncompressed_size)
+ : sectors;
+}
+
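+/*
+ * Bucket generation numbers are 8 bits and wrap: compare via a signed 8 bit
+ * difference, so that e.g. gen 1 is considered newer than gen 255:
+ */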
+static inline int gen_cmp(u8 a, u8 b)
+{
+ return (s8) (a - b);
+}
+
+static inline int gen_after(u8 a, u8 b)
+{
+ int r = gen_cmp(a, b);
+
+ return r > 0 ? r : 0;
+}
+
+/**
+ * ptr_stale() - check if a pointer points into a bucket that has been
+ * invalidated.
+ */
+static inline u8 ptr_stale(struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ u8 ret;
+
+ rcu_read_lock();
+ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* Device usage: */
+
+void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
+static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
+{
+ struct bch_dev_usage ret;
+
+ bch2_dev_usage_read_fast(ca, &ret);
+ return ret;
+}
+
+void bch2_dev_usage_init(struct bch_dev *);
+
+static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
+{
+ s64 reserved = 0;
+
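+	/*
+	 * Reserves are cumulative - note the fallthroughs: each watermark
+	 * also includes the reserve of every case below it in this switch:
+	 */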
+ switch (watermark) {
+ case BCH_WATERMARK_NR:
+ BUG();
+ case BCH_WATERMARK_stripe:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case BCH_WATERMARK_normal:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case BCH_WATERMARK_copygc:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case BCH_WATERMARK_btree:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case BCH_WATERMARK_btree_copygc:
+ case BCH_WATERMARK_reclaim:
+ break;
+ }
+
+ return reserved;
+}
+
+static inline u64 dev_buckets_free(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum bch_watermark watermark)
+{
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets -
+ ca->nr_open_buckets -
+ bch2_dev_buckets_reserved(ca, watermark));
+}
+
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum bch_watermark watermark)
+{
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets
+ + usage.d[BCH_DATA_cached].buckets
+ + usage.d[BCH_DATA_need_gc_gens].buckets
+ + usage.d[BCH_DATA_need_discard].buckets
+ - ca->nr_open_buckets
+ - bch2_dev_buckets_reserved(ca, watermark));
+}
+
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+ enum bch_watermark watermark)
+{
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
+}
+
+/* Filesystem usage: */
+
+static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
+{
+ return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;
+}
+
+static inline unsigned fs_usage_u64s(struct bch_fs *c)
+{
+ return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
+}
+
+static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
+{
+ return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
+}
+
+static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
+{
+ return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
+}
+
+static inline unsigned dev_usage_u64s(void)
+{
+ return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
+
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
+
+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
+
+void bch2_fs_usage_to_text(struct printbuf *,
+ struct bch_fs *, struct bch_fs_usage_online *);
+
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
+
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *);
+
+/* key/bucket marking: */
+
+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
+ unsigned journal_seq,
+ bool gc)
+{
+ percpu_rwsem_assert_held(&c->mark_lock);
+ BUG_ON(!gc && !journal_seq);
+
+ return this_cpu_ptr(gc
+ ? c->usage_gc
+ : c->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
+int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
+
+void bch2_fs_usage_initialize(struct bch_fs *);
+
+int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned,
+ struct gc_pos, unsigned);
+
+int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+
+int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+
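+/*
+ * Run a trigger for both halves of a key update: once for the old key as an
+ * overwrite, then once for the new key as an insert:
+ */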
+#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+({ \
+ int ret = 0; \
+ \
+ if (_old.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
+ if (!ret && _new.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
+ ret; \
+})
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
+ mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+
+void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
+int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_dev_sbs(struct bch_fs *);
+
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 b_offset = bucket_to_sector(ca, b);
+ u64 b_end = bucket_to_sector(ca, b + 1);
+ unsigned i;
+
+ if (!b)
+ return true;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ u64 end = offset + (1 << layout->sb_max_size_bits);
+
+ if (!(offset >= b_end || end <= b_offset))
+ return true;
+ }
+
+ return false;
+}
+
+/* disk reservations: */
+
+static inline void bch2_disk_reservation_put(struct bch_fs *c,
+ struct disk_reservation *res)
+{
+ if (res->sectors) {
+ this_cpu_sub(*c->online_reserved, res->sectors);
+ res->sectors = 0;
+ }
+}
+
+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
+
+int __bch2_disk_reservation_add(struct bch_fs *,
+ struct disk_reservation *,
+ u64, int);
+
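+/*
+ * Fast path: satisfy the reservation from this CPU's cached sectors_available,
+ * falling back to __bch2_disk_reservation_add() to refill from the
+ * filesystem-wide counter:
+ */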
+static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ u64 sectors, int flags)
+{
+#ifdef __KERNEL__
+ u64 old, new;
+
+ do {
+ old = this_cpu_read(c->pcpu->sectors_available);
+ if (sectors > old)
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+
+ new = old - sectors;
+ } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
+
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+ return 0;
+#else
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+#endif
+}
+
+static inline struct disk_reservation
+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
+{
+ return (struct disk_reservation) {
+ .sectors = 0,
+#if 0
+ /* not used yet: */
+ .gen = c->capacity_gen,
+#endif
+ .nr_replicas = nr_replicas,
+ };
+}
+
+static inline int bch2_disk_reservation_get(struct bch_fs *c,
+ struct disk_reservation *res,
+ u64 sectors, unsigned nr_replicas,
+ int flags)
+{
+ *res = bch2_disk_reservation_init(c, nr_replicas);
+
+ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
+}
+
+#define RESERVE_FACTOR 6
+
+static inline u64 avail_factor(u64 r)
+{
+ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
+}
+
+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
+void bch2_dev_buckets_free(struct bch_dev *);
+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
+
+#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
new file mode 100644
index 000000000000..2a9dab9006ef
--- /dev/null
+++ b/fs/bcachefs/buckets_types.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_TYPES_H
+#define _BUCKETS_TYPES_H
+
+#include "bcachefs_format.h"
+#include "util.h"
+
+#define BUCKET_JOURNAL_SEQ_BITS 16
+
+struct bucket {
+ u8 lock;
+ u8 gen_valid:1;
+ u8 data_type:7;
+ u8 gen;
+ u8 stripe_redundancy;
+ u32 stripe;
+ u32 dirty_sectors;
+ u32 cached_sectors;
+};
+
+struct bucket_array {
+ struct rcu_head rcu;
+ u16 first_bucket;
+ size_t nbuckets;
+ struct bucket b[];
+};
+
+struct bucket_gens {
+ struct rcu_head rcu;
+ u16 first_bucket;
+ size_t nbuckets;
+ u8 b[];
+};
+
+struct bch_dev_usage {
+ u64 buckets_ec;
+
+ struct {
+ u64 buckets;
+ u64 sectors; /* _compressed_ sectors: */
+ /*
+ * XXX
+ * Why do we have this? Isn't it just buckets * bucket_size -
+ * sectors?
+ */
+ u64 fragmented;
+ } d[BCH_DATA_NR];
+};
+
+struct bch_fs_usage {
+ /* all fields are in units of 512 byte sectors: */
+ u64 hidden;
+ u64 btree;
+ u64 data;
+ u64 cached;
+ u64 reserved;
+ u64 nr_inodes;
+
+ /* XXX: add stats for compression ratio */
+#if 0
+ u64 uncompressed;
+ u64 compressed;
+#endif
+
+ /* broken out: */
+
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ u64 replicas[];
+};
+
+struct bch_fs_usage_online {
+ u64 online_reserved;
+ struct bch_fs_usage u;
+};
+
+struct bch_fs_usage_short {
+ u64 capacity;
+ u64 used;
+ u64 free;
+ u64 nr_inodes;
+};
+
+/*
+ * A reservation for space on disk:
+ */
+struct disk_reservation {
+ u64 sectors;
+ u32 gen;
+ unsigned nr_replicas;
+};
+
+#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
new file mode 100644
index 000000000000..ec1b636ef78d
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/hash.h>
+#include <linux/random.h>
+
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+ unsigned hash_seed_idx, u64 dev_bucket)
+{
+ return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
+}
+
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
+{
+ unsigned i;
+
+ t->bits = bits;
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
+ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+ memset(t->d, 0, sizeof(t->d[0]) << t->bits);
+}
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket)
+{
+ struct buckets_waiting_for_journal_table *t;
+ u64 dev_bucket = (u64) dev << 56 | bucket;
+ bool ret = false;
+ unsigned i;
+
+ mutex_lock(&b->lock);
+ t = b->t;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
+
+ if (h->dev_bucket == dev_bucket) {
+ ret = h->journal_seq > flushed_seq;
+ break;
+ }
+ }
+
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+ struct bucket_hashed *new,
+ u64 flushed_seq)
+{
+ struct bucket_hashed *last_evicted = NULL;
+ unsigned tries, i;
+
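+	/*
+	 * Cuckoo hashing: each entry has three candidate slots; if they're
+	 * all live, evict one, take its slot, and try to reinsert the evicted
+	 * entry, with a bounded number of tries:
+	 */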
+ for (tries = 0; tries < 10; tries++) {
+ struct bucket_hashed *old, *victim = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ old = bucket_hash(t, i, new->dev_bucket);
+
+ if (old->dev_bucket == new->dev_bucket ||
+ old->journal_seq <= flushed_seq) {
+ *old = *new;
+ return true;
+ }
+
+ if (last_evicted != old)
+ victim = old;
+ }
+
+ /* hashed to same slot 3 times: */
+ if (!victim)
+ break;
+
+ /* Failed to find an empty slot: */
+ swap(*new, *victim);
+ last_evicted = victim;
+ }
+
+ return false;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket,
+ u64 journal_seq)
+{
+ struct buckets_waiting_for_journal_table *t, *n;
+ struct bucket_hashed tmp, new = {
+ .dev_bucket = (u64) dev << 56 | bucket,
+ .journal_seq = journal_seq,
+ };
+ size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
+ int ret = 0;
+
+ mutex_lock(&b->lock);
+
+ if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+ goto out;
+
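+	/*
+	 * Insert failed: count the entries still waiting on the journal, grow
+	 * the table if it's getting full, then rehash:
+	 */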
+ t = b->t;
+ size = 1UL << t->bits;
+ for (i = 0; i < size; i++)
+ nr_elements += t->d[i].journal_seq > flushed_seq;
+
+ new_bits = t->bits + (nr_elements * 3 > size);
+
+ n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
+ if (!n) {
+ ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+ goto out;
+ }
+
+retry_rehash:
+ nr_rehashes++;
+ bucket_table_init(n, new_bits);
+
+ tmp = new;
+ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+ for (i = 0; i < 1UL << t->bits; i++) {
+ if (t->d[i].journal_seq <= flushed_seq)
+ continue;
+
+ tmp = t->d[i];
+ if (!bucket_table_insert(n, &tmp, flushed_seq))
+ goto retry_rehash;
+ }
+
+ b->t = n;
+ kvfree(t);
+
+ pr_debug("took %zu rehashes, table at %zu/%lu elements",
+ nr_rehashes, nr_elements, 1UL << b->t->bits);
+out:
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ kvfree(b->t);
+}
+
+#define INITIAL_TABLE_BITS 3
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ mutex_init(&b->lock);
+
+ b->t = kvmalloc(sizeof(*b->t) +
+ (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
+ if (!b->t)
+ return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
+
+ bucket_table_init(b->t, INITIAL_TABLE_BITS);
+ return 0;
+}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
new file mode 100644
index 000000000000..d2ae19cbe18c
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
new file mode 100644
index 000000000000..e593db061d81
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+#include <linux/siphash.h>
+
+struct bucket_hashed {
+ u64 dev_bucket;
+ u64 journal_seq;
+};
+
+struct buckets_waiting_for_journal_table {
+ unsigned bits;
+ u64 hash_seeds[3];
+ struct bucket_hashed d[];
+};
+
+struct buckets_waiting_for_journal {
+ struct mutex lock;
+ struct buckets_waiting_for_journal_table *t;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
new file mode 100644
index 000000000000..4bb88aefed12
--- /dev/null
+++ b/fs/bcachefs/chardev.c
@@ -0,0 +1,784 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_CHARDEV
+
+#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "journal.h"
+#include "move.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/kthread.h>
+#include <linux/major.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
+ unsigned flags)
+{
+ struct bch_dev *ca;
+
+ if (flags & BCH_BY_INDEX) {
+ if (dev >= c->sb.nr_devices)
+ return ERR_PTR(-EINVAL);
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ if (!ca)
+ return ERR_PTR(-EINVAL);
+ } else {
+ char *path;
+
+ path = strndup_user((const char __user *)
+ (unsigned long) dev, PATH_MAX);
+ if (IS_ERR(path))
+ return ERR_CAST(path);
+
+ ca = bch2_dev_lookup(c, path);
+ kfree(path);
+ }
+
+ return ca;
+}
+
+#if 0
+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
+{
+ struct bch_ioctl_assemble arg;
+ struct bch_fs *c;
+ u64 *user_devs = NULL;
+ char **devs = NULL;
+ unsigned i;
+ int ret = -EFAULT;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
+ if (!user_devs)
+ return -ENOMEM;
+
+ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
+
+ if (copy_from_user(user_devs, user_arg->devs,
+ sizeof(u64) * arg.nr_devs))
+ goto err;
+
+ for (i = 0; i < arg.nr_devs; i++) {
+ devs[i] = strndup_user((const char __user *)(unsigned long)
+ user_devs[i],
+ PATH_MAX);
+		ret = PTR_ERR_OR_ZERO(devs[i]);
+ if (ret)
+ goto err;
+ }
+
+ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
+ ret = PTR_ERR_OR_ZERO(c);
+ if (!ret)
+ closure_put(&c->cl);
+err:
+ if (devs)
+ for (i = 0; i < arg.nr_devs; i++)
+ kfree(devs[i]);
+ kfree(devs);
+ return ret;
+}
+
+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
+{
+ struct bch_ioctl_incremental arg;
+ const char *err;
+ char *path;
+	char *path;
+	int ret;
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ err = bch2_fs_open_incremental(path);
+ kfree(path);
+
+ if (err) {
+ pr_err("Could not register bcachefs devices: %s", err);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif
+
+static long bch2_global_ioctl(unsigned cmd, void __user *arg)
+{
+ switch (cmd) {
+#if 0
+ case BCH_IOCTL_ASSEMBLE:
+ return bch2_ioctl_assemble(arg);
+ case BCH_IOCTL_INCREMENTAL:
+ return bch2_ioctl_incremental(arg);
+#endif
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long bch2_ioctl_query_uuid(struct bch_fs *c,
+ struct bch_ioctl_query_uuid __user *user_arg)
+{
+ if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
+ sizeof(c->sb.user_uuid)))
+ return -EFAULT;
+ return 0;
+}
+
+#if 0
+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ return bch2_fs_start(c);
+}
+
+static long bch2_ioctl_stop(struct bch_fs *c)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ bch2_fs_stop(c);
+ return 0;
+}
+#endif
+
+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+ char *path;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ ret = bch2_dev_add(c, path);
+ kfree(path);
+
+ return ret;
+}
+
+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+ struct bch_dev *ca;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ return bch2_dev_remove(c, ca, arg.flags);
+}
+
+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+ char *path;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ ret = bch2_dev_online(c, path);
+ kfree(path);
+ return ret;
+}
+
+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_dev_offline(c, ca, arg.flags);
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+static long bch2_ioctl_disk_set_state(struct bch_fs *c,
+ struct bch_ioctl_disk_set_state arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad[0] || arg.pad[1] || arg.pad[2] ||
+ arg.new_state >= BCH_MEMBER_STATE_NR)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+ if (ret)
+ bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+struct bch_data_ctx {
+ struct bch_fs *c;
+ struct bch_ioctl_data arg;
+ struct bch_move_stats stats;
+
+ int ret;
+
+ struct task_struct *thread;
+};
+
+static int bch2_data_thread(void *arg)
+{
+ struct bch_data_ctx *ctx = arg;
+
+ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+
+ ctx->stats.data_type = U8_MAX;
+ return 0;
+}
+
+static int bch2_data_job_release(struct inode *inode, struct file *file)
+{
+ struct bch_data_ctx *ctx = file->private_data;
+
+ kthread_stop(ctx->thread);
+ put_task_struct(ctx->thread);
+ kfree(ctx);
+ return 0;
+}
+
+static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct bch_data_ctx *ctx = file->private_data;
+ struct bch_fs *c = ctx->c;
+ struct bch_ioctl_data_event e = {
+ .type = BCH_DATA_EVENT_PROGRESS,
+ .p.data_type = ctx->stats.data_type,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
+ .p.sectors_total = bch2_fs_usage_read_short(c).used,
+ };
+
+ if (len < sizeof(e))
+ return -EINVAL;
+
+ if (copy_to_user(buf, &e, sizeof(e)))
+ return -EFAULT;
+
+ return sizeof(e);
+}
+
+static const struct file_operations bcachefs_data_ops = {
+ .release = bch2_data_job_release,
+ .read = bch2_data_job_read,
+ .llseek = no_llseek,
+};
+
+static long bch2_ioctl_data(struct bch_fs *c,
+ struct bch_ioctl_data arg)
+{
+ struct bch_data_ctx *ctx = NULL;
+ struct file *file = NULL;
+ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+ int ret, fd = -1;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.op >= BCH_DATA_OP_NR || arg.flags)
+ return -EINVAL;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->c = c;
+ ctx->arg = arg;
+
+ ctx->thread = kthread_create(bch2_data_thread, ctx,
+ "bch-data/%s", c->name);
+ if (IS_ERR(ctx->thread)) {
+ ret = PTR_ERR(ctx->thread);
+ goto err;
+ }
+
+ ret = get_unused_fd_flags(flags);
+ if (ret < 0)
+ goto err;
+ fd = ret;
+
+ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto err;
+ }
+
+ fd_install(fd, file);
+
+ get_task_struct(ctx->thread);
+ wake_up_process(ctx->thread);
+
+ return fd;
+err:
+ if (fd >= 0)
+ put_unused_fd(fd);
+ if (!IS_ERR_OR_NULL(ctx->thread))
+ kthread_stop(ctx->thread);
+ kfree(ctx);
+ return ret;
+}
+
+static long bch2_ioctl_fs_usage(struct bch_fs *c,
+ struct bch_ioctl_fs_usage __user *user_arg)
+{
+ struct bch_ioctl_fs_usage *arg = NULL;
+ struct bch_replicas_usage *dst_e, *dst_end;
+ struct bch_fs_usage_online *src;
+ u32 replica_entries_bytes;
+ unsigned i;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
+
+ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
+ return -EFAULT;
+
+ arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+
+ src = bch2_fs_usage_read(c);
+ if (!src) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ arg->capacity = c->capacity;
+ arg->used = bch2_fs_sectors_used(c, src);
+ arg->online_reserved = src->online_reserved;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ arg->persistent_reserved[i] = src->u.persistent_reserved[i];
+
+ dst_e = arg->replicas;
+ dst_end = (void *) arg->replicas + replica_entries_bytes;
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *src_e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ /* check that we have enough space for one replicas entry */
+ if (dst_e + 1 > dst_end) {
+ ret = -ERANGE;
+ break;
+ }
+
+ dst_e->sectors = src->u.replicas[i];
+ dst_e->r = *src_e;
+
+ /* recheck after setting nr_devs: */
+ if (replicas_usage_next(dst_e) > dst_end) {
+ ret = -ERANGE;
+ break;
+ }
+
+ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);
+
+ dst_e = replicas_usage_next(dst_e);
+ }
+
+ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;
+
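+ /*
+ * bch2_fs_usage_read() is assumed to have taken c->mark_lock for reading;
+ * drop it now that we're done with src:
+ */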
+ percpu_up_read(&c->mark_lock);
+ kfree(src);
+
+ if (ret)
+ goto err;
+ if (copy_to_user(user_arg, arg,
+ sizeof(*arg) + arg->replica_entries_bytes))
+ ret = -EFAULT;
+err:
+ kfree(arg);
+ return ret;
+}
+
+static long bch2_ioctl_dev_usage(struct bch_fs *c,
+ struct bch_ioctl_dev_usage __user *user_arg)
+{
+ struct bch_ioctl_dev_usage arg;
+ struct bch_dev_usage src;
+ struct bch_dev *ca;
+ unsigned i;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad[0] ||
+ arg.pad[1] ||
+ arg.pad[2])
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ src = bch2_dev_usage_read(ca);
+
+ arg.state = ca->mi.state;
+ arg.bucket_size = ca->mi.bucket_size;
+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+ arg.buckets_ec = src.buckets_ec;
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ arg.d[i].buckets = src.d[i].buckets;
+ arg.d[i].sectors = src.d[i].sectors;
+ arg.d[i].fragmented = src.d[i].fragmented;
+ }
+
+ percpu_ref_put(&ca->ref);
+
+ if (copy_to_user(user_arg, &arg, sizeof(arg)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long bch2_ioctl_read_super(struct bch_fs *c,
+ struct bch_ioctl_read_super arg)
+{
+ struct bch_dev *ca = NULL;
+ struct bch_sb *sb;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
+ arg.pad)
+ return -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+
+ if (arg.flags & BCH_READ_DEV) {
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+
+ if (IS_ERR(ca)) {
+ ret = PTR_ERR(ca);
+ goto err;
+ }
+
+ sb = ca->disk_sb.sb;
+ } else {
+ sb = c->disk_sb.sb;
+ }
+
+ if (vstruct_bytes(sb) > arg.size) {
+ ret = -ERANGE;
+ goto err;
+ }
+
+ if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
+ vstruct_bytes(sb)))
+ ret = -EFAULT;
+err:
+ if (!IS_ERR_OR_NULL(ca))
+ percpu_ref_put(&ca->ref);
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
+ struct bch_ioctl_disk_get_idx arg)
+{
+ dev_t dev = huge_decode_dev(arg.dev);
+ struct bch_dev *ca;
+ unsigned i;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
+ return -EINVAL;
+
+ for_each_online_member(ca, c, i)
+ if (ca->dev == dev) {
+ percpu_ref_put(&ca->io_ref);
+ return i;
+ }
+
+ return -BCH_ERR_ENOENT_dev_idx_not_found;
+}
+
+static long bch2_ioctl_disk_resize(struct bch_fs *c,
+ struct bch_ioctl_disk_resize arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_dev_resize(c, ca, arg.nbuckets);
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_journal arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ if (arg.nbuckets > U32_MAX)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+#define BCH_IOCTL(_name, _argtype) \
+do { \
+ _argtype i; \
+ \
+ if (copy_from_user(&i, arg, sizeof(i))) \
+ return -EFAULT; \
+ ret = bch2_ioctl_##_name(c, i); \
+ goto out; \
+} while (0)
+
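+/*
+ * The first switch below handles commands that are valid even before the
+ * filesystem is fully started; everything after the BCH_FS_STARTED check
+ * requires a running filesystem:
+ */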
+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
+{
+ long ret;
+
+ switch (cmd) {
+ case BCH_IOCTL_QUERY_UUID:
+ return bch2_ioctl_query_uuid(c, arg);
+ case BCH_IOCTL_FS_USAGE:
+ return bch2_ioctl_fs_usage(c, arg);
+ case BCH_IOCTL_DEV_USAGE:
+ return bch2_ioctl_dev_usage(c, arg);
+#if 0
+ case BCH_IOCTL_START:
+ BCH_IOCTL(start, struct bch_ioctl_start);
+ case BCH_IOCTL_STOP:
+ return bch2_ioctl_stop(c);
+#endif
+ case BCH_IOCTL_READ_SUPER:
+ BCH_IOCTL(read_super, struct bch_ioctl_read_super);
+ case BCH_IOCTL_DISK_GET_IDX:
+ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
+ }
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
+
+ switch (cmd) {
+ case BCH_IOCTL_DISK_ADD:
+ BCH_IOCTL(disk_add, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_REMOVE:
+ BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_ONLINE:
+ BCH_IOCTL(disk_online, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_OFFLINE:
+ BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_SET_STATE:
+ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
+ case BCH_IOCTL_DATA:
+ BCH_IOCTL(data, struct bch_ioctl_data);
+ case BCH_IOCTL_DISK_RESIZE:
+ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+ case BCH_IOCTL_DISK_RESIZE_JOURNAL:
+ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
+
+ default:
+ return -ENOTTY;
+ }
+out:
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
+}
+
+static DEFINE_IDR(bch_chardev_minor);
+
+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
+{
+ unsigned minor = iminor(file_inode(filp));
+ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
+ void __user *arg = (void __user *) v;
+
+ return c
+ ? bch2_fs_ioctl(c, cmd, arg)
+ : bch2_global_ioctl(cmd, arg);
+}
+
+static const struct file_operations bch_chardev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = bch2_chardev_ioctl,
+ .open = nonseekable_open,
+};
+
+static int bch_chardev_major;
+static struct class *bch_chardev_class;
+static struct device *bch_chardev;
+
+void bch2_fs_chardev_exit(struct bch_fs *c)
+{
+ if (!IS_ERR_OR_NULL(c->chardev))
+ device_unregister(c->chardev);
+ if (c->minor >= 0)
+ idr_remove(&bch_chardev_minor, c->minor);
+}
+
+int bch2_fs_chardev_init(struct bch_fs *c)
+{
+ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
+ if (c->minor < 0)
+ return c->minor;
+
+ c->chardev = device_create(bch_chardev_class, NULL,
+ MKDEV(bch_chardev_major, c->minor), c,
+ "bcachefs%u-ctl", c->minor);
+ if (IS_ERR(c->chardev))
+ return PTR_ERR(c->chardev);
+
+ return 0;
+}
+
+void bch2_chardev_exit(void)
+{
+ if (!IS_ERR_OR_NULL(bch_chardev_class))
+ device_destroy(bch_chardev_class,
+ MKDEV(bch_chardev_major, U8_MAX));
+ if (!IS_ERR_OR_NULL(bch_chardev_class))
+ class_destroy(bch_chardev_class);
+ if (bch_chardev_major > 0)
+ unregister_chrdev(bch_chardev_major, "bcachefs");
+}
+
+int __init bch2_chardev_init(void)
+{
+ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
+ if (bch_chardev_major < 0)
+ return bch_chardev_major;
+
+ bch_chardev_class = class_create("bcachefs");
+ if (IS_ERR(bch_chardev_class))
+ return PTR_ERR(bch_chardev_class);
+
+ bch_chardev = device_create(bch_chardev_class, NULL,
+ MKDEV(bch_chardev_major, U8_MAX),
+ NULL, "bcachefs-ctl");
+ if (IS_ERR(bch_chardev))
+ return PTR_ERR(bch_chardev);
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
new file mode 100644
index 000000000000..0f563ca53c36
--- /dev/null
+++ b/fs/bcachefs/chardev.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHARDEV_H
+#define _BCACHEFS_CHARDEV_H
+
+#ifndef NO_BCACHEFS_FS
+
+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
+
+void bch2_fs_chardev_exit(struct bch_fs *);
+int bch2_fs_chardev_init(struct bch_fs *);
+
+void bch2_chardev_exit(void);
+int __init bch2_chardev_init(void);
+
+#else
+
+static inline long bch2_fs_ioctl(struct bch_fs *c,
+ unsigned cmd, void __user * arg)
+{
+ return -ENOTTY;
+}
+
+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
+
+static inline void bch2_chardev_exit(void) {}
+static inline int __init bch2_chardev_init(void) { return 0; }
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
new file mode 100644
index 000000000000..3c761ad6b1c8
--- /dev/null
+++ b/fs/bcachefs/checksum.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "errcode.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/crc32c.h>
+#include <linux/crypto.h>
+#include <linux/xxhash.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+
+/*
+ * bch2_checksum_state is an abstraction of the checksum state calculated over
+ * different pages: it lets data be fed in page by page without the checksum
+ * algorithm losing its intermediate state.
+ *
+ * For simple checksum algorithms (like crc), a seed value is all that needs to
+ * be carried; hash-like algorithms need their full state stored.
+ */
+
+struct bch2_checksum_state {
+ union {
+ u64 seed;
+ struct xxh64_state h64state;
+ };
+ unsigned int type;
+};
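+
+/*
+ * Typical use of the helpers below (this is the pattern bch2_checksum()
+ * follows):
+ *
+ *	struct bch2_checksum_state state;
+ *
+ *	state.type = type;
+ *	bch2_checksum_init(&state);
+ *	bch2_checksum_update(&state, data, len);
+ *	csum = bch2_checksum_final(&state);
+ */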
+
+static void bch2_checksum_init(struct bch2_checksum_state *state)
+{
+ switch (state->type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ state->seed = 0;
+ break;
+ case BCH_CSUM_crc32c_nonzero:
+ state->seed = U32_MAX;
+ break;
+ case BCH_CSUM_crc64_nonzero:
+ state->seed = U64_MAX;
+ break;
+ case BCH_CSUM_xxhash:
+ xxh64_reset(&state->h64state, 0);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
+{
+ switch (state->type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ return state->seed;
+ case BCH_CSUM_crc32c_nonzero:
+ return state->seed ^ U32_MAX;
+ case BCH_CSUM_crc64_nonzero:
+ return state->seed ^ U64_MAX;
+ case BCH_CSUM_xxhash:
+ return xxh64_digest(&state->h64state);
+ default:
+ BUG();
+ }
+}
+
+static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
+{
+ switch (state->type) {
+ case BCH_CSUM_none:
+ return;
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc32c:
+ state->seed = crc32c(state->seed, data, len);
+ break;
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc64:
+ state->seed = crc64_be(state->seed, data, len);
+ break;
+ case BCH_CSUM_xxhash:
+ xxh64_update(&state->h64state, data, len);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+ struct nonce nonce,
+ struct scatterlist *sg, size_t len)
+{
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ int ret;
+
+ skcipher_request_set_sync_tfm(req, tfm);
+ skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+
+ ret = crypto_skcipher_encrypt(req);
+ if (ret)
+ pr_err("got error %i from crypto_skcipher_encrypt()", ret);
+
+ return ret;
+}
+
+static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
+ struct nonce nonce,
+ void *buf, size_t len)
+{
+ if (!is_vmalloc_addr(buf)) {
+ struct scatterlist sg;
+
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
+ return do_encrypt_sg(tfm, nonce, &sg, len);
+ } else {
+ unsigned pages = buf_pages(buf, len);
+ struct scatterlist *sg;
+ size_t orig_len = len;
+ int ret, i;
+
+ sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
+ if (!sg)
+ return -BCH_ERR_ENOMEM_do_encrypt;
+
+ sg_init_table(sg, pages);
+
+ for (i = 0; i < pages; i++) {
+ unsigned offset = offset_in_page(buf);
+ unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
+
+ sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
+ buf += pg_len;
+ len -= pg_len;
+ }
+
+ ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
+ kfree(sg);
+ return ret;
+ }
+}
+
+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
+ void *buf, size_t len)
+{
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
+ int ret;
+
+ ret = PTR_ERR_OR_ZERO(chacha20);
+ if (ret) {
+ pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
+ return ret;
+ }
+
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
+ if (ret) {
+ pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
+ goto err;
+ }
+
+ ret = do_encrypt(chacha20, nonce, buf, len);
+err:
+ crypto_free_sync_skcipher(chacha20);
+ return ret;
+}
+
+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+ struct nonce nonce)
+{
+ u8 key[POLY1305_KEY_SIZE];
+ int ret;
+
+ nonce.d[3] ^= BCH_NONCE_POLY;
+
+ memset(key, 0, sizeof(key));
+ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ if (ret)
+ return ret;
+
+ desc->tfm = c->poly1305;
+ crypto_shash_init(desc);
+ crypto_shash_update(desc, key, sizeof(key));
+ return 0;
+}
+
+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
+ struct nonce nonce, const void *data, size_t len)
+{
+ switch (type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
+ struct bch2_checksum_state state;
+
+ state.type = type;
+
+ bch2_checksum_init(&state);
+ bch2_checksum_update(&state, data, len);
+
+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
+ }
+
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
+ SHASH_DESC_ON_STACK(desc, c->poly1305);
+ u8 digest[POLY1305_DIGEST_SIZE];
+ struct bch_csum ret = { 0 };
+
+ gen_poly_key(c, desc, nonce);
+
+ crypto_shash_update(desc, data, len);
+ crypto_shash_final(desc, digest);
+
+ memcpy(&ret, digest, bch_crc_bytes[type]);
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
+
+int bch2_encrypt(struct bch_fs *c, unsigned type,
+ struct nonce nonce, void *data, size_t len)
+{
+ if (!bch2_csum_type_is_encryption(type))
+ return 0;
+
+ return do_encrypt(c->chacha20, nonce, data, len);
+}
+
+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio,
+ struct bvec_iter *iter)
+{
+ struct bio_vec bv;
+
+ switch (type) {
+ case BCH_CSUM_none:
+ return (struct bch_csum) { 0 };
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
+ struct bch2_checksum_state state;
+
+ state.type = type;
+ bch2_checksum_init(&state);
+
+#ifdef CONFIG_HIGHMEM
+ __bio_for_each_segment(bv, bio, *iter, *iter) {
+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
+
+ bch2_checksum_update(&state, p, bv.bv_len);
+ kunmap_local(p);
+ }
+#else
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
+ bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
+ bv.bv_len);
+#endif
+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
+ }
+
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
+ SHASH_DESC_ON_STACK(desc, c->poly1305);
+ u8 digest[POLY1305_DIGEST_SIZE];
+ struct bch_csum ret = { 0 };
+
+ gen_poly_key(c, desc, nonce);
+
+#ifdef CONFIG_HIGHMEM
+ __bio_for_each_segment(bv, bio, *iter, *iter) {
+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
+
+ crypto_shash_update(desc, p, bv.bv_len);
+ kunmap_local(p);
+ }
+#else
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
+ crypto_shash_update(desc,
+ page_address(bv.bv_page) + bv.bv_offset,
+ bv.bv_len);
+#endif
+ crypto_shash_final(desc, digest);
+
+ memcpy(&ret, digest, bch_crc_bytes[type]);
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ struct bvec_iter iter = bio->bi_iter;
+
+ return __bch2_checksum_bio(c, type, nonce, bio, &iter);
+}
+
+int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ struct scatterlist sgl[16], *sg = sgl;
+ size_t bytes = 0;
+ int ret = 0;
+
+ if (!bch2_csum_type_is_encryption(type))
+ return 0;
+
+ sg_init_table(sgl, ARRAY_SIZE(sgl));
+
+ bio_for_each_segment(bv, bio, iter) {
+ if (sg == sgl + ARRAY_SIZE(sgl)) {
+ sg_mark_end(sg - 1);
+
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ if (ret)
+ return ret;
+
+ nonce = nonce_add(nonce, bytes);
+ bytes = 0;
+
+ sg_init_table(sgl, ARRAY_SIZE(sgl));
+ sg = sgl;
+ }
+
+ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
+ bytes += bv.bv_len;
+ }
+
+ sg_mark_end(sg - 1);
+ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+}
+
+struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
+ struct bch_csum b, size_t b_len)
+{
+ struct bch2_checksum_state state;
+
+ state.type = type;
+ bch2_checksum_init(&state);
+ state.seed = le64_to_cpu(a.lo);
+
+ BUG_ON(!bch2_checksum_mergeable(type));
+
+ while (b_len) {
+ unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
+
+ bch2_checksum_update(&state,
+ page_address(ZERO_PAGE(0)), page_len);
+ b_len -= page_len;
+ }
+ a.lo = cpu_to_le64(bch2_checksum_final(&state));
+ a.lo ^= b.lo;
+ a.hi ^= b.hi;
+ return a;
+}
+
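+/*
+ * Re-checksum a bio in up to three pieces (len_a, len_b, and the remainder),
+ * first verifying that the combined result still matches crc_old, then
+ * returning new crcs for the first two pieces:
+ */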
+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
+ struct bversion version,
+ struct bch_extent_crc_unpacked crc_old,
+ struct bch_extent_crc_unpacked *crc_a,
+ struct bch_extent_crc_unpacked *crc_b,
+ unsigned len_a, unsigned len_b,
+ unsigned new_csum_type)
+{
+ struct bvec_iter iter = bio->bi_iter;
+ struct nonce nonce = extent_nonce(version, crc_old);
+ struct bch_csum merged = { 0 };
+ struct crc_split {
+ struct bch_extent_crc_unpacked *crc;
+ unsigned len;
+ unsigned csum_type;
+ struct bch_csum csum;
+ } splits[3] = {
+ { crc_a, len_a, new_csum_type, { 0 }},
+ { crc_b, len_b, new_csum_type, { 0 } },
+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
+ }, *i;
+ bool mergeable = crc_old.csum_type == new_csum_type &&
+ bch2_checksum_mergeable(new_csum_type);
+ unsigned crc_nonce = crc_old.nonce;
+
+ BUG_ON(len_a + len_b > bio_sectors(bio));
+ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
+ BUG_ON(crc_is_compressed(crc_old));
+ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
+ bch2_csum_type_is_encryption(new_csum_type));
+
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+ iter.bi_size = i->len << 9;
+ if (mergeable || i->crc)
+ i->csum = __bch2_checksum_bio(c, i->csum_type,
+ nonce, bio, &iter);
+ else
+ bio_advance_iter(bio, &iter, i->len << 9);
+ nonce = nonce_add(nonce, i->len << 9);
+ }
+
+ if (mergeable)
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
+ merged = bch2_checksum_merge(new_csum_type, merged,
+ i->csum, i->len << 9);
+ else
+ merged = bch2_checksum_bio(c, crc_old.csum_type,
+ extent_nonce(version, crc_old), bio);
+
+ if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
+ bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
+ "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+ __func__,
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo,
+ bch2_csum_types[crc_old.csum_type],
+ bch2_csum_types[new_csum_type]);
+ return -EIO;
+ }
+
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+ if (i->crc)
+ *i->crc = (struct bch_extent_crc_unpacked) {
+ .csum_type = i->csum_type,
+ .compression_type = crc_old.compression_type,
+ .compressed_size = i->len,
+ .uncompressed_size = i->len,
+ .offset = 0,
+ .live_size = i->len,
+ .nonce = crc_nonce,
+ .csum = i->csum,
+ };
+
+ if (bch2_csum_type_is_encryption(new_csum_type))
+ crc_nonce += i->len;
+ }
+
+ return 0;
+}
+
+/* BCH_SB_FIELD_crypt: */
+
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&crypt->field), sizeof(*crypt));
+ return -BCH_ERR_invalid_sb_crypt;
+ }
+
+ if (BCH_CRYPT_KDF_TYPE(crypt)) {
+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ return -BCH_ERR_invalid_sb_crypt;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+ .validate = bch2_sb_crypt_validate,
+ .to_text = bch2_sb_crypt_to_text,
+};
+
+#ifdef __KERNEL__
+static int __bch2_request_key(char *key_description, struct bch_key *key)
+{
+ struct key *keyring_key;
+ const struct user_key_payload *ukp;
+ int ret;
+
+ keyring_key = request_key(&key_type_user, key_description, NULL);
+ if (IS_ERR(keyring_key))
+ return PTR_ERR(keyring_key);
+
+ down_read(&keyring_key->sem);
+ ukp = dereference_key_locked(keyring_key);
+ if (ukp->datalen == sizeof(*key)) {
+ memcpy(key, ukp->data, ukp->datalen);
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ up_read(&keyring_key->sem);
+ key_put(keyring_key);
+
+ return ret;
+}
+#else
+#include <keyutils.h>
+
+static int __bch2_request_key(char *key_description, struct bch_key *key)
+{
+ key_serial_t key_id;
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_SESSION_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_USER_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_USER_SESSION_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
+ return -errno;
+got_key:
+
+ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
+ return -1;
+
+ return 0;
+}
+
+#include "../crypto.h"
+#endif
+
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+ struct printbuf key_description = PRINTBUF;
+ int ret;
+
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
+
+ ret = __bch2_request_key(key_description.buf, key);
+ printbuf_exit(&key_description);
+
+#ifndef __KERNEL__
+ if (ret) {
+ char *passphrase = read_passphrase("Enter passphrase: ");
+ struct bch_encrypted_key sb_key;
+
+ bch2_passphrase_check(sb, passphrase,
+ key, &sb_key);
+ ret = 0;
+ }
+#endif
+
+ /* stash with memfd, pass memfd fd to mount */
+
+ return ret;
+}
+
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *sb)
+{
+ key_serial_t key_id;
+ struct printbuf key_description = PRINTBUF;
+
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
+
+ key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
+ printbuf_exit(&key_description);
+ if (key_id < 0)
+ return errno;
+
+ keyctl_revoke(key_id);
+
+ return 0;
+}
+#endif
+
+int bch2_decrypt_sb_key(struct bch_fs *c,
+ struct bch_sb_field_crypt *crypt,
+ struct bch_key *key)
+{
+ struct bch_encrypted_key sb_key = crypt->key;
+ struct bch_key user_key;
+ int ret = 0;
+
+ /* is key encrypted? */
+ if (!bch2_key_is_encrypted(&sb_key))
+ goto out;
+
+ ret = bch2_request_key(c->disk_sb.sb, &user_key);
+ if (ret) {
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
+ goto err;
+ }
+
+ /* decrypt real key: */
+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+ &sb_key, sizeof(sb_key));
+ if (ret)
+ goto err;
+
+ if (bch2_key_is_encrypted(&sb_key)) {
+ bch_err(c, "incorrect encryption key");
+ ret = -EINVAL;
+ goto err;
+ }
+out:
+ *key = sb_key.key;
+err:
+ memzero_explicit(&sb_key, sizeof(sb_key));
+ memzero_explicit(&user_key, sizeof(user_key));
+ return ret;
+}
+
+static int bch2_alloc_ciphers(struct bch_fs *c)
+{
+ int ret;
+
+ if (!c->chacha20)
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+ ret = PTR_ERR_OR_ZERO(c->chacha20);
+
+ if (ret) {
+ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
+ return ret;
+ }
+
+ if (!c->poly1305)
+ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+ ret = PTR_ERR_OR_ZERO(c->poly1305);
+
+ if (ret) {
+ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_disable_encryption(struct bch_fs *c)
+{
+ struct bch_sb_field_crypt *crypt;
+ struct bch_key key;
+ int ret = -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+
+ crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
+ if (!crypt)
+ goto out;
+
+ /* is key encrypted? */
+ ret = 0;
+ if (bch2_key_is_encrypted(&crypt->key))
+ goto out;
+
+ ret = bch2_decrypt_sb_key(c, crypt, &key);
+ if (ret)
+ goto out;
+
+ crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
+ crypt->key.key = key;
+
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
+ bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+{
+ struct bch_encrypted_key key;
+ struct bch_key user_key;
+ struct bch_sb_field_crypt *crypt;
+ int ret = -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+
+ /* Do we already have an encryption key? */
+ if (bch2_sb_field_get(c->disk_sb.sb, crypt))
+ goto err;
+
+ ret = bch2_alloc_ciphers(c);
+ if (ret)
+ goto err;
+
+ key.magic = cpu_to_le64(BCH_KEY_MAGIC);
+ get_random_bytes(&key.key, sizeof(key.key));
+
+ if (keyed) {
+ ret = bch2_request_key(c->disk_sb.sb, &user_key);
+ if (ret) {
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
+ goto err;
+ }
+
+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+ &key, sizeof(key));
+ if (ret)
+ goto err;
+ }
+
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
+ (void *) &key.key, sizeof(key.key));
+ if (ret)
+ goto err;
+
+ crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
+ sizeof(*crypt) / sizeof(u64));
+ if (!crypt) {
+ ret = -BCH_ERR_ENOSPC_sb_crypt;
+ goto err;
+ }
+
+ crypt->key = key;
+
+ /* write superblock */
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
+ bch2_write_super(c);
+err:
+ mutex_unlock(&c->sb_lock);
+ memzero_explicit(&user_key, sizeof(user_key));
+ memzero_explicit(&key, sizeof(key));
+ return ret;
+}
+
+void bch2_fs_encryption_exit(struct bch_fs *c)
+{
+ if (!IS_ERR_OR_NULL(c->poly1305))
+ crypto_free_shash(c->poly1305);
+ if (!IS_ERR_OR_NULL(c->chacha20))
+ crypto_free_sync_skcipher(c->chacha20);
+ if (!IS_ERR_OR_NULL(c->sha256))
+ crypto_free_shash(c->sha256);
+}
+
+int bch2_fs_encryption_init(struct bch_fs *c)
+{
+ struct bch_sb_field_crypt *crypt;
+ struct bch_key key;
+ int ret = 0;
+
+ c->sha256 = crypto_alloc_shash("sha256", 0, 0);
+ ret = PTR_ERR_OR_ZERO(c->sha256);
+ if (ret) {
+ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
+ goto out;
+ }
+
+ crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
+ if (!crypt)
+ goto out;
+
+ ret = bch2_alloc_ciphers(c);
+ if (ret)
+ goto out;
+
+ ret = bch2_decrypt_sb_key(c, crypt, &key);
+ if (ret)
+ goto out;
+
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
+ (void *) &key.key, sizeof(key.key));
+ if (ret)
+ goto out;
+out:
+ memzero_explicit(&key, sizeof(key));
+ return ret;
+}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
new file mode 100644
index 000000000000..13998388c545
--- /dev/null
+++ b/fs/bcachefs/checksum.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHECKSUM_H
+#define _BCACHEFS_CHECKSUM_H
+
+#include "bcachefs.h"
+#include "extents_types.h"
+#include "super-io.h"
+
+#include <linux/crc64.h>
+#include <crypto/chacha.h>
+
+static inline bool bch2_checksum_mergeable(unsigned type)
+{
+ switch (type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
+ struct bch_csum, size_t);
+
+#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
+#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
+#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
+#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
+#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
+
+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
+ const void *, size_t);
+
+/*
+ * This is used for various on disk data structures - bch_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first field of these structs
+ */
+#define csum_vstruct(_c, _type, _nonce, _i) \
+({ \
+ const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
+ \
+ bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
+})
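+
+/*
+ * e.g. (names hypothetical) for any such struct pointed to by i, with ->csum
+ * as its first field:
+ *
+ *	i->csum = csum_vstruct(c, csum_type, nonce, i);
+ *
+ * checksums everything from just past ->csum up to vstruct_end(i).
+ */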
+
+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
+int bch2_request_key(struct bch_sb *, struct bch_key *);
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *);
+#endif
+
+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+ void *data, size_t);
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
+
+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
+ struct bch_extent_crc_unpacked,
+ struct bch_extent_crc_unpacked *,
+ struct bch_extent_crc_unpacked *,
+ unsigned, unsigned, unsigned);
+
+int __bch2_encrypt_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
+
+static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ return bch2_csum_type_is_encryption(type)
+ ? __bch2_encrypt_bio(c, type, nonce, bio)
+ : 0;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
+
+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
+ struct bch_key *);
+
+int bch2_disable_encryption(struct bch_fs *);
+int bch2_enable_encryption(struct bch_fs *, bool);
+
+void bch2_fs_encryption_exit(struct bch_fs *);
+int bch2_fs_encryption_init(struct bch_fs *);
+
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+ bool data)
+{
+ switch (type) {
+ case BCH_CSUM_OPT_none:
+ return BCH_CSUM_none;
+ case BCH_CSUM_OPT_crc32c:
+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
+ case BCH_CSUM_OPT_crc64:
+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
+ case BCH_CSUM_OPT_xxhash:
+ return BCH_CSUM_xxhash;
+ default:
+ BUG();
+ }
+}
+
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+ struct bch_io_opts opts)
+{
+ if (opts.nocow)
+ return 0;
+
+ if (c->sb.encryption_type)
+ return c->opts.wide_macs
+ ? BCH_CSUM_chacha20_poly1305_128
+ : BCH_CSUM_chacha20_poly1305_80;
+
+ return bch2_csum_opt_to_type(opts.data_checksum, true);
+}
+
+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
+{
+ if (c->sb.encryption_type)
+ return BCH_CSUM_chacha20_poly1305_128;
+
+ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
+}
+
+static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
+ unsigned type)
+{
+ if (type >= BCH_CSUM_NR)
+ return false;
+
+ if (bch2_csum_type_is_encryption(type) && !c->chacha20)
+ return false;
+
+ return true;
+}
+
+/* returns true if not equal */
+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
+{
+ /*
+ * XXX: need some way of preventing the compiler from optimizing this
+ * into a form that isn't constant time..
+ */
+ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
+}
+
+/* for skipping ahead and encrypting/decrypting at an offset: */
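+/* (e.g. extent_nonce() below passes crc.nonce << 9 to skip whole sectors) */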
+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
+{
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
+
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
+ return nonce;
+}
+
+static inline struct nonce null_nonce(void)
+{
+ struct nonce ret;
+
+ memset(&ret, 0, sizeof(ret));
+ return ret;
+}
+
+static inline struct nonce extent_nonce(struct bversion version,
+ struct bch_extent_crc_unpacked crc)
+{
+ unsigned compression_type = crc_is_compressed(crc)
+ ? crc.compression_type
+ : 0;
+ unsigned size = compression_type ? crc.uncompressed_size : 0;
+ struct nonce nonce = (struct nonce) {{
+ [0] = cpu_to_le32(size << 22),
+ [1] = cpu_to_le32(version.lo),
+ [2] = cpu_to_le32(version.lo >> 32),
+ [3] = cpu_to_le32(version.hi|
+ (compression_type << 24))^BCH_NONCE_EXTENT,
+ }};
+
+ return nonce_add(nonce, crc.nonce << 9);
+}
+
+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
+{
+ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
+}
+
+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
+{
+ __le64 magic = __bch2_sb_magic(sb);
+
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = 0,
+ [2] = ((__le32 *) &magic)[0],
+ [3] = ((__le32 *) &magic)[1],
+ }};
+}
+
+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
+{
+ __le64 magic = bch2_sb_magic(c);
+
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = 0,
+ [2] = ((__le32 *) &magic)[0],
+ [3] = ((__le32 *) &magic)[1],
+ }};
+}
+
+#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
new file mode 100644
index 000000000000..f41889093a2c
--- /dev/null
+++ b/fs/bcachefs/clock.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "clock.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+
+static inline long io_timer_cmp(io_timer_heap *h,
+ struct io_timer *l,
+ struct io_timer *r)
+{
+ return l->expire - r->expire;
+}
+
+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+ size_t i;
+
+ spin_lock(&clock->timer_lock);
+
+ if (time_after_eq((unsigned long) atomic64_read(&clock->now),
+ timer->expire)) {
+ spin_unlock(&clock->timer_lock);
+ timer->fn(timer);
+ return;
+ }
+
+ for (i = 0; i < clock->timers.used; i++)
+ if (clock->timers.data[i] == timer)
+ goto out;
+
+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL));
+out:
+ spin_unlock(&clock->timer_lock);
+}
+
+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+{
+ size_t i;
+
+ spin_lock(&clock->timer_lock);
+
+ for (i = 0; i < clock->timers.used; i++)
+ if (clock->timers.data[i] == timer) {
+ heap_del(&clock->timers, i, io_timer_cmp, NULL);
+ break;
+ }
+
+ spin_unlock(&clock->timer_lock);
+}
+
+struct io_clock_wait {
+ struct io_timer io_timer;
+ struct timer_list cpu_timer;
+ struct task_struct *task;
+ int expired;
+};
+
+static void io_clock_wait_fn(struct io_timer *timer)
+{
+ struct io_clock_wait *wait = container_of(timer,
+ struct io_clock_wait, io_timer);
+
+ wait->expired = 1;
+ wake_up_process(wait->task);
+}
+
+static void io_clock_cpu_timeout(struct timer_list *timer)
+{
+ struct io_clock_wait *wait = container_of(timer,
+ struct io_clock_wait, cpu_timer);
+
+ wait->expired = 1;
+ wake_up_process(wait->task);
+}
+
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+{
+ struct io_clock_wait wait;
+
+ /* XXX: calculate sleep time rigorously */
+ wait.io_timer.expire = until;
+ wait.io_timer.fn = io_clock_wait_fn;
+ wait.task = current;
+ wait.expired = 0;
+ bch2_io_timer_add(clock, &wait.io_timer);
+
+ schedule();
+
+ bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+ unsigned long io_until,
+ unsigned long cpu_timeout)
+{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct io_clock_wait wait;
+
+ wait.io_timer.expire = io_until;
+ wait.io_timer.fn = io_clock_wait_fn;
+ wait.task = current;
+ wait.expired = 0;
+ bch2_io_timer_add(clock, &wait.io_timer);
+
+ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
+
+ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
+ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread && kthread_should_stop())
+ break;
+
+ if (wait.expired)
+ break;
+
+ schedule();
+ try_to_freeze();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ del_timer_sync(&wait.cpu_timer);
+ destroy_timer_on_stack(&wait.cpu_timer);
+ bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+ unsigned long now)
+{
+ struct io_timer *ret = NULL;
+
+ spin_lock(&clock->timer_lock);
+
+ if (clock->timers.used &&
+ time_after_eq(now, clock->timers.data[0]->expire))
+ heap_pop(&clock->timers, ret, io_timer_cmp, NULL);
+
+ spin_unlock(&clock->timer_lock);
+
+ return ret;
+}
+
+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
+{
+ struct io_timer *timer;
+ unsigned long now = atomic64_add_return(sectors, &clock->now);
+
+ while ((timer = get_expired_timer(clock, now)))
+ timer->fn(timer);
+}
+
+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
+{
+ unsigned long now;
+ unsigned i;
+
+ out->atomic++;
+ spin_lock(&clock->timer_lock);
+ now = atomic64_read(&clock->now);
+
+ for (i = 0; i < clock->timers.used; i++)
+ prt_printf(out, "%ps:\t%li\n",
+ clock->timers.data[i]->fn,
+ clock->timers.data[i]->expire - now);
+ spin_unlock(&clock->timer_lock);
+ --out->atomic;
+}
+
+void bch2_io_clock_exit(struct io_clock *clock)
+{
+ free_heap(&clock->timers);
+ free_percpu(clock->pcpu_buf);
+}
+
+int bch2_io_clock_init(struct io_clock *clock)
+{
+ atomic64_set(&clock->now, 0);
+ spin_lock_init(&clock->timer_lock);
+
+ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
+
+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+ if (!clock->pcpu_buf)
+ return -BCH_ERR_ENOMEM_io_clock_init;
+
+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+ return -BCH_ERR_ENOMEM_io_clock_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
new file mode 100644
index 000000000000..70a0f7436c84
--- /dev/null
+++ b/fs/bcachefs/clock.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_H
+#define _BCACHEFS_CLOCK_H
+
+void bch2_io_timer_add(struct io_clock *, struct io_timer *);
+void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
+ unsigned long);
+
+void __bch2_increment_clock(struct io_clock *, unsigned);
+
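+/*
+ * Sectors are batched in a percpu counter and only folded into clock->now (via
+ * __bch2_increment_clock()) once a cpu has accumulated IO_CLOCK_PCPU_SECTORS,
+ * so the clock may lag by up to clock->max_slop sectors:
+ */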
+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
+ int rw)
+{
+ struct io_clock *clock = &c->io_clock[rw];
+
+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
+ IO_CLOCK_PCPU_SECTORS))
+ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
+}
+
+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+
+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_timeout(wq, condition, timeout); \
+ __ret; \
+})
+
+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
+
+void bch2_io_clock_exit(struct io_clock *);
+int bch2_io_clock_init(struct io_clock *);
+
+#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
new file mode 100644
index 000000000000..5fae0012d808
--- /dev/null
+++ b/fs/bcachefs/clock_types.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_TYPES_H
+#define _BCACHEFS_CLOCK_TYPES_H
+
+#include "util.h"
+
+#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
+
+/*
+ * Clocks/timers in units of sectors of IO:
+ *
+ * Note - they use percpu batching, so they're only approximate.
+ */
+
+struct io_timer;
+typedef void (*io_timer_fn)(struct io_timer *);
+
+struct io_timer {
+ io_timer_fn fn;
+ unsigned long expire;
+};
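+
+/*
+ * Illustrative use (hypothetical callback), arming a timer to fire after
+ * another 1024 sectors of IO on @clock:
+ *
+ *	static void my_fn(struct io_timer *t) { ... }
+ *
+ *	struct io_timer t = {
+ *		.fn	= my_fn,
+ *		.expire	= atomic64_read(&clock->now) + 1024,
+ *	};
+ *	bch2_io_timer_add(clock, &t);
+ */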
+
+/* Amount to buffer up on a percpu counter */
+#define IO_CLOCK_PCPU_SECTORS 128
+
+typedef HEAP(struct io_timer *) io_timer_heap;
+
+struct io_clock {
+ atomic64_t now;
+ u16 __percpu *pcpu_buf;
+ unsigned max_slop;
+
+ spinlock_t timer_lock;
+ io_timer_heap timers;
+};
+
+#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
new file mode 100644
index 000000000000..51af8ea230ed
--- /dev/null
+++ b/fs/bcachefs/compress.c
@@ -0,0 +1,732 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "compress.h"
+#include "extents.h"
+#include "super-io.h"
+
+#include <linux/lz4.h>
+#include <linux/zlib.h>
+#include <linux/zstd.h>
+
+/* Bounce buffer: */
+struct bbuf {
+ void *b;
+ enum {
+ BB_NONE,
+ BB_VMAP,
+ BB_KMALLOC,
+ BB_MEMPOOL,
+ } type;
+ int rw;
+};
+
+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
+{
+ void *b;
+
+ BUG_ON(size > c->opts.encoded_extent_max);
+
+ b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
+ if (b)
+ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
+
+ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
+ if (b)
+ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+
+ BUG();
+}
+
+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ void *expected_start = NULL;
+
+ __bio_for_each_bvec(bv, bio, iter, start) {
+ if (expected_start &&
+ expected_start != page_address(bv.bv_page) + bv.bv_offset)
+ return false;
+
+ expected_start = page_address(bv.bv_page) +
+ bv.bv_offset + bv.bv_len;
+ }
+
+ return true;
+}
+
+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
+ struct bvec_iter start, int rw)
+{
+ struct bbuf ret;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ unsigned nr_pages = 0;
+ struct page *stack_pages[16];
+ struct page **pages = NULL;
+ void *data;
+
+ BUG_ON(start.bi_size > c->opts.encoded_extent_max);
+
+ if (!PageHighMem(bio_iter_page(bio, start)) &&
+ bio_phys_contig(bio, start))
+ return (struct bbuf) {
+ .b = page_address(bio_iter_page(bio, start)) +
+ bio_iter_offset(bio, start),
+ .type = BB_NONE, .rw = rw
+ };
+
+ /* check if we can map the pages contiguously: */
+ __bio_for_each_segment(bv, bio, iter, start) {
+ if (iter.bi_size != start.bi_size &&
+ bv.bv_offset)
+ goto bounce;
+
+ if (bv.bv_len < iter.bi_size &&
+ bv.bv_offset + bv.bv_len < PAGE_SIZE)
+ goto bounce;
+
+ nr_pages++;
+ }
+
+ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
+
+ pages = nr_pages > ARRAY_SIZE(stack_pages)
+ ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
+ : stack_pages;
+ if (!pages)
+ goto bounce;
+
+ nr_pages = 0;
+ __bio_for_each_segment(bv, bio, iter, start)
+ pages[nr_pages++] = bv.bv_page;
+
+ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (pages != stack_pages)
+ kfree(pages);
+
+ if (data)
+ return (struct bbuf) {
+ .b = data + bio_iter_offset(bio, start),
+ .type = BB_VMAP, .rw = rw
+ };
+bounce:
+ ret = __bounce_alloc(c, start.bi_size, rw);
+
+ if (rw == READ)
+ memcpy_from_bio(ret.b, bio, start);
+
+ return ret;
+}
+
+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
+{
+ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
+}
+
+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
+{
+ switch (buf.type) {
+ case BB_NONE:
+ break;
+ case BB_VMAP:
+ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
+ break;
+ case BB_KMALLOC:
+ kfree(buf.b);
+ break;
+ case BB_MEMPOOL:
+ mempool_free(buf.b, &c->compression_bounce[buf.rw]);
+ break;
+ }
+}
+
+static inline void zlib_set_workspace(z_stream *strm, void *workspace)
+{
+#ifdef __KERNEL__
+ strm->workspace = workspace;
+#endif
+}
+
+static int __bio_uncompress(struct bch_fs *c, struct bio *src,
+ void *dst_data, struct bch_extent_crc_unpacked crc)
+{
+ struct bbuf src_data = { NULL };
+ size_t src_len = src->bi_iter.bi_size;
+ size_t dst_len = crc.uncompressed_size << 9;
+ void *workspace;
+ int ret;
+
+ src_data = bio_map_or_bounce(c, src, READ);
+
+ switch (crc.compression_type) {
+ case BCH_COMPRESSION_TYPE_lz4_old:
+ case BCH_COMPRESSION_TYPE_lz4:
+ ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
+ src_len, dst_len, dst_len);
+ if (ret != dst_len)
+ goto err;
+ break;
+ case BCH_COMPRESSION_TYPE_gzip: {
+ z_stream strm = {
+ .next_in = src_data.b,
+ .avail_in = src_len,
+ .next_out = dst_data,
+ .avail_out = dst_len,
+ };
+
+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+
+ zlib_set_workspace(&strm, workspace);
+ zlib_inflateInit2(&strm, -MAX_WBITS);
+ ret = zlib_inflate(&strm, Z_FINISH);
+
+ mempool_free(workspace, &c->decompress_workspace);
+
+ if (ret != Z_STREAM_END)
+ goto err;
+ break;
+ }
+ case BCH_COMPRESSION_TYPE_zstd: {
+ ZSTD_DCtx *ctx;
+ size_t real_src_len = le32_to_cpup(src_data.b);
+
+ if (real_src_len > src_len - 4)
+ goto err;
+
+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
+
+ ret = zstd_decompress_dctx(ctx,
+ dst_data, dst_len,
+ src_data.b + 4, real_src_len);
+
+ mempool_free(workspace, &c->decompress_workspace);
+
+ if (ret != dst_len)
+ goto err;
+ break;
+ }
+ default:
+ BUG();
+ }
+ ret = 0;
+out:
+ bio_unmap_or_unbounce(c, src_data);
+ return ret;
+err:
+ ret = -EIO;
+ goto out;
+}
+
+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
+ struct bch_extent_crc_unpacked *crc)
+{
+ struct bbuf data = { NULL };
+ size_t dst_len = crc->uncompressed_size << 9;
+
+ /* bio must own its pages: */
+ BUG_ON(!bio->bi_vcnt);
+ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
+
+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
+ crc->compressed_size << 9 > c->opts.encoded_extent_max) {
+ bch_err(c, "error rewriting existing data: extent too big");
+ return -EIO;
+ }
+
+ data = __bounce_alloc(c, dst_len, WRITE);
+
+ if (__bio_uncompress(c, bio, data.b, *crc)) {
+ if (!c->opts.no_data_io)
+ bch_err(c, "error rewriting existing data: decompression error");
+ bio_unmap_or_unbounce(c, data);
+ return -EIO;
+ }
+
+ /*
+ * XXX: don't have a good way to assert that the bio was allocated with
+ * enough space, we depend on bch2_move_extent doing the right thing
+ */
+ bio->bi_iter.bi_size = crc->live_size << 9;
+
+ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
+
+ crc->csum_type = 0;
+ crc->compression_type = 0;
+ crc->compressed_size = crc->live_size;
+ crc->uncompressed_size = crc->live_size;
+ crc->offset = 0;
+ crc->csum = (struct bch_csum) { 0, 0 };
+
+ bio_unmap_or_unbounce(c, data);
+ return 0;
+}
+
+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
+ struct bio *dst, struct bvec_iter dst_iter,
+ struct bch_extent_crc_unpacked crc)
+{
+ struct bbuf dst_data = { NULL };
+ size_t dst_len = crc.uncompressed_size << 9;
+ int ret;
+
+ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
+ crc.compressed_size << 9 > c->opts.encoded_extent_max)
+ return -EIO;
+
+ dst_data = dst_len == dst_iter.bi_size
+ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
+ : __bounce_alloc(c, dst_len, WRITE);
+
+ ret = __bio_uncompress(c, src, dst_data.b, crc);
+ if (ret)
+ goto err;
+
+ if (dst_data.type != BB_NONE &&
+ dst_data.type != BB_VMAP)
+ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
+err:
+ bio_unmap_or_unbounce(c, dst_data);
+ return ret;
+}
+
+static int attempt_compress(struct bch_fs *c,
+ void *workspace,
+ void *dst, size_t dst_len,
+ void *src, size_t src_len,
+ struct bch_compression_opt compression)
+{
+ enum bch_compression_type compression_type =
+ __bch2_compression_opt_to_type[compression.type];
+
+ switch (compression_type) {
+ case BCH_COMPRESSION_TYPE_lz4:
+ if (compression.level < LZ4HC_MIN_CLEVEL) {
+ int len = src_len;
+ int ret = LZ4_compress_destSize(
+ src, dst,
+ &len, dst_len,
+ workspace);
+ if (len < src_len)
+ return -len;
+
+ return ret;
+ } else {
+ int ret = LZ4_compress_HC(
+ src, dst,
+ src_len, dst_len,
+ compression.level,
+ workspace);
+
+ return ret ?: -1;
+ }
+ case BCH_COMPRESSION_TYPE_gzip: {
+ z_stream strm = {
+ .next_in = src,
+ .avail_in = src_len,
+ .next_out = dst,
+ .avail_out = dst_len,
+ };
+
+ zlib_set_workspace(&strm, workspace);
+ zlib_deflateInit2(&strm,
+ compression.level
+ ? clamp_t(unsigned, compression.level,
+ Z_BEST_SPEED, Z_BEST_COMPRESSION)
+ : Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY);
+
+ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+ return 0;
+
+ if (zlib_deflateEnd(&strm) != Z_OK)
+ return 0;
+
+ return strm.total_out;
+ }
+ case BCH_COMPRESSION_TYPE_zstd: {
+ /*
+ * rescale: zstd's max compression level is 22, our max level is 15
+ * (so e.g. our level 10 maps to zstd level 15, our level 15 to 22)
+ */
+ unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
+ ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
+ ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
+
+ /*
+ * ZSTD requires that when we decompress we pass in the exact
+ * compressed size - rounding it up to the nearest sector
+ * doesn't work, so we use the first 4 bytes of the buffer for
+ * that.
+ *
+ * Additionally, the ZSTD code seems to have a bug where it will
+ * write just past the end of the buffer - so subtract a fudge
+ * factor (7 bytes) from the dst buffer size to account for
+ * that.
+ */
+ size_t len = zstd_compress_cctx(ctx,
+ dst + 4, dst_len - 4 - 7,
+ src, src_len,
+ &params);
+ if (zstd_is_error(len))
+ return 0;
+
+ *((__le32 *) dst) = cpu_to_le32(len);
+ return len + 4;
+ }
+ default:
+ BUG();
+ }
+}
+
+static unsigned __bio_compress(struct bch_fs *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ struct bch_compression_opt compression)
+{
+ struct bbuf src_data = { NULL }, dst_data = { NULL };
+ void *workspace;
+ enum bch_compression_type compression_type =
+ __bch2_compression_opt_to_type[compression.type];
+ unsigned pad;
+ int ret = 0;
+
+ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
+ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+
+ /* If it's only one block, don't bother trying to compress: */
+ if (src->bi_iter.bi_size <= c->opts.block_size)
+ return BCH_COMPRESSION_TYPE_incompressible;
+
+ dst_data = bio_map_or_bounce(c, dst, WRITE);
+ src_data = bio_map_or_bounce(c, src, READ);
+
+ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS);
+
+ *src_len = src->bi_iter.bi_size;
+ *dst_len = dst->bi_iter.bi_size;
+
+ /*
+ * XXX: this algorithm sucks when the compression code doesn't tell us
+ * how much would fit, like LZ4 does:
+ */
+ while (1) {
+ if (*src_len <= block_bytes(c)) {
+ ret = -1;
+ break;
+ }
+
+ ret = attempt_compress(c, workspace,
+ dst_data.b, *dst_len,
+ src_data.b, *src_len,
+ compression);
+ if (ret > 0) {
+ *dst_len = ret;
+ ret = 0;
+ break;
+ }
+
+ /* Didn't fit: should we retry with a smaller amount? */
+ if (*src_len <= *dst_len) {
+ ret = -1;
+ break;
+ }
+
+ /*
+ * If ret is negative, it's a hint as to how much data would fit
+ */
+ BUG_ON(-ret >= *src_len);
+
+ if (ret < 0)
+ *src_len = -ret;
+ else
+ *src_len -= (*src_len - *dst_len) / 2;
+ *src_len = round_down(*src_len, block_bytes(c));
+ }
+
+ mempool_free(workspace, &c->compress_workspace[compression_type]);
+
+ if (ret)
+ goto err;
+
+ /* Didn't get smaller: */
+ if (round_up(*dst_len, block_bytes(c)) >= *src_len)
+ goto err;
+
+ pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
+
+ memset(dst_data.b + *dst_len, 0, pad);
+ *dst_len += pad;
+
+ if (dst_data.type != BB_NONE &&
+ dst_data.type != BB_VMAP)
+ memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
+
+ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
+ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
+ BUG_ON(*dst_len & (block_bytes(c) - 1));
+ BUG_ON(*src_len & (block_bytes(c) - 1));
+ ret = compression_type;
+out:
+ bio_unmap_or_unbounce(c, src_data);
+ bio_unmap_or_unbounce(c, dst_data);
+ return ret;
+err:
+ ret = BCH_COMPRESSION_TYPE_incompressible;
+ goto out;
+}
+
+unsigned bch2_bio_compress(struct bch_fs *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ unsigned compression_opt)
+{
+ unsigned orig_dst = dst->bi_iter.bi_size;
+ unsigned orig_src = src->bi_iter.bi_size;
+ unsigned compression_type;
+
+ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
+ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
+ c->opts.encoded_extent_max);
+ /* Don't generate a bigger output than input: */
+ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+ compression_type =
+ __bio_compress(c, dst, dst_len, src, src_len,
+ bch2_compression_decode(compression_opt));
+
+ dst->bi_iter.bi_size = orig_dst;
+ src->bi_iter.bi_size = orig_src;
+ return compression_type;
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *, u64);
+
+#define BCH_FEATURE_none 0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
+};
+
+#undef BCH_FEATURE_none
+
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
+{
+ int ret = 0;
+
+ if ((c->sb.features & f) == f)
+ return 0;
+
+ mutex_lock(&c->sb_lock);
+
+ if ((c->sb.features & f) == f) {
+ mutex_unlock(&c->sb_lock);
+ return 0;
+ }
+
+ ret = __bch2_fs_compress_init(c, c->sb.features|f);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ return ret;
+ }
+
+ c->disk_sb.sb->features[0] |= cpu_to_le64(f);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+int bch2_check_set_has_compressed_data(struct bch_fs *c,
+ unsigned compression_opt)
+{
+ unsigned compression_type = bch2_compression_decode(compression_opt).type;
+
+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+ return compression_type
+ ? __bch2_check_set_has_compressed_data(c,
+ 1ULL << bch2_compression_opt_to_feature[compression_type])
+ : 0;
+}
+
+void bch2_fs_compress_exit(struct bch_fs *c)
+{
+ unsigned i;
+
+ mempool_exit(&c->decompress_workspace);
+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+ mempool_exit(&c->compress_workspace[i]);
+ mempool_exit(&c->compression_bounce[WRITE]);
+ mempool_exit(&c->compression_bounce[READ]);
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+{
+ size_t decompress_workspace_size = 0;
+ ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
+ c->opts.encoded_extent_max);
+
+ /*
+ * ZSTD is lying: if we allocate the size of the workspace it says it
+ * requires, it returns memory allocation errors
+ */
+ c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
+
+ struct {
+ unsigned feature;
+ enum bch_compression_type type;
+ size_t compress_workspace;
+ size_t decompress_workspace;
+ } compression_types[] = {
+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
+ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
+ 0 },
+ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+ zlib_inflate_workspacesize(), },
+ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
+ c->zstd_workspace_size,
+ zstd_dctx_workspace_bound() },
+ }, *i;
+ bool have_compressed = false;
+
+ for (i = compression_types;
+ i < compression_types + ARRAY_SIZE(compression_types);
+ i++)
+ have_compressed |= (features & (1 << i->feature)) != 0;
+
+ if (!have_compressed)
+ return 0;
+
+ if (!mempool_initialized(&c->compression_bounce[READ]) &&
+ mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
+ 1, c->opts.encoded_extent_max))
+ return -BCH_ERR_ENOMEM_compression_bounce_read_init;
+
+ if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
+ mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
+ 1, c->opts.encoded_extent_max))
+ return -BCH_ERR_ENOMEM_compression_bounce_write_init;
+
+ for (i = compression_types;
+ i < compression_types + ARRAY_SIZE(compression_types);
+ i++) {
+ decompress_workspace_size =
+ max(decompress_workspace_size, i->decompress_workspace);
+
+ if (!(features & (1 << i->feature)))
+ continue;
+
+ if (mempool_initialized(&c->compress_workspace[i->type]))
+ continue;
+
+ if (mempool_init_kvpmalloc_pool(
+ &c->compress_workspace[i->type],
+ 1, i->compress_workspace))
+ return -BCH_ERR_ENOMEM_compression_workspace_init;
+ }
+
+ if (!mempool_initialized(&c->decompress_workspace) &&
+ mempool_init_kvpmalloc_pool(&c->decompress_workspace,
+ 1, decompress_workspace_size))
+ return -BCH_ERR_ENOMEM_decompression_workspace_init;
+
+ return 0;
+}
+
+static u64 compression_opt_to_feature(unsigned v)
+{
+ unsigned type = bch2_compression_decode(v).type;
+
+ return BIT_ULL(bch2_compression_opt_to_feature[type]);
+}
+
+int bch2_fs_compress_init(struct bch_fs *c)
+{
+ u64 f = c->sb.features;
+
+ f |= compression_opt_to_feature(c->opts.compression);
+ f |= compression_opt_to_feature(c->opts.background_compression);
+
+ return __bch2_fs_compress_init(c, f);
+}
+
+int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
+ struct printbuf *err)
+{
+ char *val = kstrdup(_val, GFP_KERNEL);
+ char *p = val, *type_str, *level_str;
+ struct bch_compression_opt opt = { 0 };
+ int ret;
+
+ if (!val)
+ return -ENOMEM;
+
+ type_str = strsep(&p, ":");
+ level_str = p;
+
+ ret = match_string(bch2_compression_opts, -1, type_str);
+ if (ret < 0 && err)
+ prt_str(err, "invalid compression type");
+ if (ret < 0)
+ goto err;
+
+ opt.type = ret;
+
+ if (level_str) {
+ unsigned level;
+
+ ret = kstrtouint(level_str, 10, &level);
+ if (!ret && !opt.type && level)
+ ret = -EINVAL;
+ if (!ret && level > 15)
+ ret = -EINVAL;
+ if (ret < 0 && err)
+ prt_str(err, "invalid compression level");
+ if (ret < 0)
+ goto err;
+
+ opt.level = level;
+ }
+
+ *res = bch2_compression_encode(opt);
+err:
+ kfree(val);
+ return ret;
+}
+
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+ struct bch_compression_opt opt = bch2_compression_decode(v);
+
+ if (opt.type < BCH_COMPRESSION_OPT_NR)
+ prt_str(out, bch2_compression_opts[opt.type]);
+ else
+ prt_printf(out, "(unknown compression opt %u)", opt.type);
+ if (opt.level)
+ prt_printf(out, ":%u", opt.level);
+}
+
+void bch2_opt_compression_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ return bch2_compression_opt_to_text(out, v);
+}
+
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+ if (!bch2_compression_opt_valid(v)) {
+ prt_printf(err, "invalid compression opt %llu", v);
+ return -BCH_ERR_invalid_sb_opt_compression;
+ }
+
+ return 0;
+}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
new file mode 100644
index 000000000000..607fd5e232c9
--- /dev/null
+++ b/fs/bcachefs/compress.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COMPRESS_H
+#define _BCACHEFS_COMPRESS_H
+
+#include "extents_types.h"
+
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
+};
+
+struct bch_compression_opt {
+ u8 type:4,
+ level:4;
+};
+
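+/*
+ * A compression option is packed into 8 bits: the low 4 bits are the type,
+ * the high 4 bits the level:
+ */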
+static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
+{
+ return (struct bch_compression_opt) {
+ .type = v & 15,
+ .level = v >> 4,
+ };
+}
+
+static inline bool bch2_compression_opt_valid(unsigned v)
+{
+ struct bch_compression_opt opt = __bch2_compression_decode(v);
+
+ return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
+}
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+ return bch2_compression_opt_valid(v)
+ ? __bch2_compression_decode(v)
+ : (struct bch_compression_opt) { 0 };
+}
+
+static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
+{
+ return opt.type | (opt.level << 4);
+}
+
+static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
+{
+ return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
+}
+
+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
+ struct bch_extent_crc_unpacked *);
+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
+ struct bvec_iter, struct bch_extent_crc_unpacked);
+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
+ struct bio *, size_t *, unsigned);
+
+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
+void bch2_fs_compress_exit(struct bch_fs *);
+int bch2_fs_compress_init(struct bch_fs *);
+
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
+int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
+void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+int bch2_opt_compression_validate(u64, struct printbuf *);
+
+#define bch2_opt_compression (struct bch_opt_fn) { \
+ .parse = bch2_opt_compression_parse, \
+ .to_text = bch2_opt_compression_to_text, \
+ .validate = bch2_opt_compression_validate, \
+}
+
+#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c
new file mode 100644
index 000000000000..02a996e06a64
--- /dev/null
+++ b/fs/bcachefs/counters.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "super-io.h"
+#include "counters.h"
+
+/* BCH_SB_FIELD_counters */
+
+static const char * const bch2_counter_names[] = {
+#define x(t, n, ...) (#t),
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
+
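+/* Number of counter slots in the (variable length) superblock field: */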
+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
+{
+ if (!ctrs)
+ return 0;
+
+ return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
+}
+
+static int bch2_sb_counters_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ return 0;
+}
+
+static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ for (i = 0; i < nr; i++) {
+ if (i < BCH_COUNTER_NR)
+ prt_printf(out, "%s ", bch2_counter_names[i]);
+ else
+ prt_printf(out, "(unknown)");
+
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
+ prt_newline(out);
+ }
+}
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+ u64 val = 0;
+
+ for (i = 0; i < BCH_COUNTER_NR; i++)
+ c->counters_on_mount[i] = 0;
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
+ val = le64_to_cpu(ctrs->d[i]);
+ percpu_u64_set(&c->counters[i], val);
+ c->counters_on_mount[i] = val;
+ }
+ return 0;
+}
+
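+/*
+ * Write the current in-memory percpu counters back into the superblock field,
+ * growing it to hold BCH_COUNTER_NR entries if necessary:
+ */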
+int bch2_sb_counters_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
+ struct bch_sb_field_counters *ret;
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ if (nr < BCH_COUNTER_NR) {
+ ret = bch2_sb_field_resize(&c->disk_sb, counters,
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
+
+ if (ret) {
+ ctrs = ret;
+ nr = bch2_sb_counter_nr_entries(ctrs);
+ }
+ }
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
+ ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+ return 0;
+}
+
+void bch2_fs_counters_exit(struct bch_fs *c)
+{
+ free_percpu(c->counters);
+}
+
+int bch2_fs_counters_init(struct bch_fs *c)
+{
+ c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
+ if (!c->counters)
+ return -BCH_ERR_ENOMEM_fs_counters_init;
+
+ return bch2_sb_counters_to_cpu(c);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_counters = {
+ .validate = bch2_sb_counters_validate,
+ .to_text = bch2_sb_counters_to_text,
+};
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h
new file mode 100644
index 000000000000..4778aa19bf34
--- /dev/null
+++ b/fs/bcachefs/counters.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COUNTERS_H
+#define _BCACHEFS_COUNTERS_H
+
+#include "bcachefs.h"
+#include "super-io.h"
+
+
+int bch2_sb_counters_to_cpu(struct bch_fs *);
+int bch2_sb_counters_from_cpu(struct bch_fs *);
+
+void bch2_fs_counters_exit(struct bch_fs *);
+int bch2_fs_counters_init(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+
+#endif /* _BCACHEFS_COUNTERS_H */
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
new file mode 100644
index 000000000000..ac35b8b705ae
--- /dev/null
+++ b/fs/bcachefs/darray.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include "darray.h"
+
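+/*
+ * Grow the backing allocation to at least new_size elements, rounded up to a
+ * power of two; existing elements are copied over, and the old buffer is
+ * freed unless it was the inline preallocated storage:
+ */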
+int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+{
+ if (new_size > d->size) {
+ new_size = roundup_pow_of_two(new_size);
+
+ void *data = kvmalloc_array(new_size, element_size, gfp);
+ if (!data)
+ return -ENOMEM;
+
+ memcpy(data, d->data, d->size * element_size);
+ if (d->data != d->preallocated)
+ kvfree(d->data);
+ d->data = data;
+ d->size = new_size;
+ }
+
+ return 0;
+}
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
new file mode 100644
index 000000000000..e367c625f057
--- /dev/null
+++ b/fs/bcachefs/darray.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
+
+/*
+ * Dynamic arrays:
+ *
+ * Inspired by CCAN's darray
+ */
+
+#include <linux/slab.h>
+
+#define DARRAY_PREALLOCATED(_type, _nr) \
+struct { \
+ size_t nr, size; \
+ _type *data; \
+ _type preallocated[_nr]; \
+}
+
+#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
+
+typedef DARRAY(char) darray_char;
+
+int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
+
+static inline int __darray_resize(darray_char *d, size_t element_size,
+ size_t new_size, gfp_t gfp)
+{
+ return unlikely(new_size > d->size)
+ ? __bch2_darray_resize(d, element_size, new_size, gfp)
+ : 0;
+}
+
+#define darray_resize_gfp(_d, _new_size, _gfp) \
+ unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
+
+#define darray_resize(_d, _new_size) \
+ darray_resize_gfp(_d, _new_size, GFP_KERNEL)
+
+static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
+{
+ return __darray_resize(d, t_size, d->nr + more, gfp);
+}
+
+#define darray_make_room_gfp(_d, _more, _gfp) \
+ __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
+
+#define darray_make_room(_d, _more) \
+ darray_make_room_gfp(_d, _more, GFP_KERNEL)
+
+#define darray_room(_d) ((_d).size - (_d).nr)
+
+#define darray_top(_d) ((_d).data[(_d).nr])
+
+#define darray_push_gfp(_d, _item, _gfp) \
+({ \
+ int _ret = darray_make_room_gfp((_d), 1, _gfp); \
+ \
+ if (!_ret) \
+ (_d)->data[(_d)->nr++] = (_item); \
+ _ret; \
+})
+
+#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
+
+#define darray_pop(_d) ((_d)->data[--(_d)->nr])
+
+#define darray_first(_d) ((_d).data[0])
+#define darray_last(_d) ((_d).data[(_d).nr - 1])
+
+#define darray_insert_item(_d, pos, _item) \
+({ \
+ size_t _pos = (pos); \
+ int _ret = darray_make_room((_d), 1); \
+ \
+ if (!_ret) \
+ array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \
+ _ret; \
+})
+
+#define darray_remove_item(_d, _pos) \
+ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+
+#define darray_for_each(_d, _i) \
+ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_for_each_reverse(_d, _i) \
+ for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+
+#define darray_init(_d) \
+do { \
+ (_d)->nr = 0; \
+ (_d)->size = ARRAY_SIZE((_d)->preallocated); \
+ (_d)->data = (_d)->size ? (_d)->preallocated : NULL; \
+} while (0)
+
+#define darray_exit(_d) \
+do { \
+ if (!ARRAY_SIZE((_d)->preallocated) || \
+ (_d)->data != (_d)->preallocated) \
+ kvfree((_d)->data); \
+ darray_init(_d); \
+} while (0)
+
+#endif /* _BCACHEFS_DARRAY_H */
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
new file mode 100644
index 000000000000..37d6ecae8c30
--- /dev/null
+++ b/fs/bcachefs/data_update.c
@@ -0,0 +1,653 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "io_write.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "trace.h"
+
+static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
+{
+ if (trace_move_extent_finish_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_move_extent_finish(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+static void trace_move_extent_fail2(struct data_update *m,
+ struct bkey_s_c new,
+ struct bkey_s_c wrote,
+ struct bkey_i *insert,
+ const char *msg)
+{
+ struct bch_fs *c = m->op.c;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ unsigned i, rewrites_found = 0;
+
+ if (!trace_move_extent_fail_enabled())
+ return;
+
+ prt_str(&buf, msg);
+
+ if (insert) {
+ i = 0;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached)
+ rewrites_found |= 1U << i;
+ i++;
+ }
+ }
+
+ prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u",
+ (m->data_opts.rewrite_ptrs & (1 << 0)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 1)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 2)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 3)) != 0);
+
+ prt_printf(&buf, "\nrewrites found: %u%u%u%u",
+ (rewrites_found & (1 << 0)) != 0,
+ (rewrites_found & (1 << 1)) != 0,
+ (rewrites_found & (1 << 2)) != 0,
+ (rewrites_found & (1 << 3)) != 0);
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, new);
+
+ prt_str(&buf, "\nwrote: ");
+ bch2_bkey_val_to_text(&buf, c, wrote);
+
+ if (insert) {
+ prt_str(&buf, "\ninsert: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ }
+
+ trace_move_extent_fail(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static int __bch2_data_update_index_update(struct btree_trans *trans,
+ struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_iter iter;
+ struct data_update *m =
+ container_of(op, struct data_update, op);
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_buf _new, _insert;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&_new);
+ bch2_bkey_buf_init(&_insert);
+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
+ bch2_trans_iter_init(trans, &iter, m->btree_id,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ while (1) {
+ struct bkey_s_c k;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ struct bkey_i *insert = NULL;
+ struct bkey_i_extent *new;
+ const union bch_extent_entry *entry_c;
+ union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_extent_ptr *ptr;
+ const struct bch_extent_ptr *ptr_c;
+ struct bpos next_pos;
+ bool should_check_enospc;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ unsigned rewrites_found = 0, durability, i;
+
+ bch2_trans_begin(trans);
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ new = bkey_i_to_extent(bch2_keylist_front(keys));
+
+ if (!bch2_extents_match(k, old)) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
+ NULL, "no match:");
+ goto nowork;
+ }
+
+ bkey_reassemble(_insert.k, k);
+ insert = _insert.k;
+
+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+ new = bkey_i_to_extent(_new.k);
+ bch2_cut_front(iter.pos, &new->k_i);
+
+ bch2_cut_front(iter.pos, insert);
+ bch2_cut_back(new->k.p, insert);
+ bch2_cut_back(insert->k.p, &new->k_i);
+
+ /*
+ * @old: extent that we read from
+ * @insert: key that we're going to update, initialized from
+ * extent currently in btree - same as @old unless we raced with
+ * other updates
+ * @new: extent with new pointers that we'll be adding to @insert
+ *
+ * First, drop rewrite_ptrs from @new:
+ */
+ i = 0;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached) {
+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+ rewrites_found |= 1U << i;
+ }
+ i++;
+ }
+
+ if (m->data_opts.rewrite_ptrs &&
+ !rewrites_found &&
+ bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
+ goto nowork;
+ }
+
+ /*
+ * A replica that we just wrote might conflict with a replica
+ * that we want to keep, due to racing with another move:
+ */
+restart_drop_conflicting_replicas:
+ extent_for_each_ptr(extent_i_to_s(new), ptr)
+ if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
+ !ptr_c->cached) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
+ goto restart_drop_conflicting_replicas;
+ }
+
+ if (!bkey_val_u64s(&new->k)) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+ goto nowork;
+ }
+
+ /* Now, drop pointers that conflict with what we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+
+ durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
+ bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
+
+ /* Now, drop excess replicas: */
+restart_drop_extra_replicas:
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+
+ if (!p.ptr.cached &&
+ durability - ptr_durability >= m->op.opts.data_replicas) {
+ durability -= ptr_durability;
+
+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+ goto restart_drop_extra_replicas;
+ }
+ }
+
+ /* Finally, add the pointers we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ bch2_extent_ptr_decoded_append(insert, &p);
+
+ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
+ bch2_extent_normalize(c, bkey_i_to_s(insert));
+
+ ret = bch2_sum_sector_overwrites(trans, &iter, insert,
+ &should_check_enospc,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ goto err;
+
+ if (disk_sectors_delta > (s64) op->res.sectors) {
+ ret = bch2_disk_reservation_add(c, &op->res,
+ disk_sectors_delta - op->res.sectors,
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto out;
+ }
+
+ next_pos = insert->k.p;
+
+ /*
+ * Check for nonce offset inconsistency:
+ * This is debug code - we've been seeing this bug rarely, and
+ * it's been hard to reproduce, so this should give us some more
+ * information when it does occur:
+ */
+ struct printbuf err = PRINTBUF;
+ int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+ printbuf_exit(&err);
+
+ if (invalid) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ goto out;
+ }
+
+ ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+ k.k->p, bkey_start_pos(&insert->k)) ?:
+ bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+ k.k->p, insert->k.p) ?:
+ bch2_bkey_set_needs_rebalance(c, insert,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_trans_update(trans, &iter, insert,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, &op->res,
+ NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ m->data_opts.btree_insert_flags);
+ if (!ret) {
+ bch2_btree_iter_set_pos(&iter, next_pos);
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
+ trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
+ }
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ if (ret)
+ break;
+next:
+ while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
+ bch2_keylist_pop_front(keys);
+ if (bch2_keylist_empty(keys))
+ goto out;
+ }
+ continue;
+nowork:
+ if (m->stats) {
+ BUG_ON(k.k->p.offset <= iter.pos.offset);
+ atomic64_inc(&m->stats->keys_raced);
+ atomic64_add(k.k->p.offset - iter.pos.offset,
+ &m->stats->sectors_raced);
+ }
+
+ this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
+
+ bch2_btree_iter_advance(&iter);
+ goto next;
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_bkey_buf_exit(&_insert, c);
+ bch2_bkey_buf_exit(&_new, c);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ return ret;
+}
+
+int bch2_data_update_index_update(struct bch_write_op *op)
+{
+ return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
+}
+
+void bch2_data_update_read_done(struct data_update *m,
+ struct bch_extent_crc_unpacked crc)
+{
+ /* write bio must own pages: */
+ BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+ m->op.crc = crc;
+ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+
+ closure_call(&m->op.cl, bch2_write, NULL, NULL);
+}
+
+void bch2_data_update_exit(struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+ struct bkey_ptrs_c ptrs =
+ bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (c->opts.nocow_enabled)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, ptr), 0);
+ percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+ }
+
+ bch2_bkey_buf_exit(&update->k, c);
+ bch2_disk_reservation_put(c, &update->op.res);
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+}
+
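+/*
+ * Unwritten extents have no data to read or write: just allocate new space
+ * and insert new unwritten pointers through the normal index update path:
+ */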
+static void bch2_update_unwritten_extent(struct btree_trans *trans,
+ struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+ struct bio *bio = &update->op.wbio.bio;
+ struct bkey_i_extent *e;
+ struct write_point *wp;
+ struct bch_extent_ptr *ptr;
+ struct closure cl;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ closure_init_stack(&cl);
+ bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
+
+ while (bio_sectors(bio)) {
+ unsigned sectors = bio_sectors(bio);
+
+ bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
+ BTREE_ITER_SLOTS);
+ ret = lockrestart_do(trans, ({
+ k = bch2_btree_iter_peek_slot(&iter);
+ bkey_err(k);
+ }));
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
+ break;
+
+ e = bkey_extent_init(update->op.insert_keys.top);
+ e->k.p = update->op.pos;
+
+ ret = bch2_alloc_sectors_start_trans(trans,
+ update->op.target,
+ false,
+ update->op.write_point,
+ &update->op.devs_have,
+ update->op.nr_replicas,
+ update->op.nr_replicas,
+ update->op.watermark,
+ 0, &cl, &wp);
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ continue;
+ }
+
+ if (ret)
+ return;
+
+ sectors = min(sectors, wp->sectors_free);
+
+ bch2_key_resize(&e->k, sectors);
+
+ bch2_open_bucket_get(c, wp, &update->op.open_buckets);
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+ bch2_alloc_sectors_done(c, wp);
+
+ bio_advance(bio, sectors << 9);
+ update->op.pos.offset += sectors;
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptr->unwritten = true;
+ bch2_keylist_push(&update->op.insert_keys);
+
+ ret = __bch2_data_update_index_update(trans, &update->op);
+
+ bch2_open_buckets_put(c, &update->op.open_buckets);
+
+ if (ret)
+ break;
+ }
+
+ if (closure_nr_remaining(&cl) != 1) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ }
+}
+
+int bch2_extent_drop_ptrs(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct data_update_opts data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ n = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ while (data_opts.kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+ data_opts.kill_ptrs ^= 1U << drop;
+ }
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ n->k.size = 0;
+
+ return bch2_trans_relock(trans) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+int bch2_data_update_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct moving_context *ctxt,
+ struct data_update *m,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts,
+ enum btree_id btree_id,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ const struct bch_extent_ptr *ptr;
+ unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+ unsigned ptrs_locked = 0;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&m->k);
+ bch2_bkey_buf_reassemble(&m->k, c, k);
+ m->btree_id = btree_id;
+ m->data_opts = data_opts;
+ m->ctxt = ctxt;
+ m->stats = ctxt ? ctxt->stats : NULL;
+
+ bch2_write_op_init(&m->op, c, io_opts);
+ m->op.pos = bkey_start_pos(k.k);
+ m->op.version = k.k->version;
+ m->op.target = data_opts.target;
+ m->op.write_point = wp;
+ m->op.nr_replicas = 0;
+ m->op.flags |= BCH_WRITE_PAGES_STABLE|
+ BCH_WRITE_PAGES_OWNED|
+ BCH_WRITE_DATA_ENCODED|
+ BCH_WRITE_MOVE|
+ m->data_opts.write_flags;
+ m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
+ m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+
+ unsigned durability_have = 0, durability_removing = 0;
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ bool locked;
+
+ if (((1U << i) & m->data_opts.rewrite_ptrs)) {
+ BUG_ON(p.ptr.cached);
+
+ if (crc_is_compressed(p.crc))
+ reserve_sectors += k.k->size;
+
+ m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
+ durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+ } else if (!p.ptr.cached &&
+ !((1U << i) & m->data_opts.kill_ptrs)) {
+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+ durability_have += bch2_extent_ptr_durability(c, &p);
+ }
+
+ /*
+ * op->csum_type is normally initialized from the fs/file's
+ * current options - but if an extent is encrypted, we require
+ * that it stays encrypted:
+ */
+ if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+ m->op.nonce = p.crc.nonce + p.crc.offset;
+ m->op.csum_type = p.crc.csum_type;
+ }
+
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ m->op.incompressible = true;
+
+ if (c->opts.nocow_enabled) {
+ if (ctxt) {
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0)) ||
+ (!atomic_read(&ctxt->read_sectors) &&
+ !atomic_read(&ctxt->write_sectors)));
+
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0);
+ } else {
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0)) {
+ ret = -BCH_ERR_nocow_lock_blocked;
+ goto err;
+ }
+ }
+ ptrs_locked |= (1U << i);
+ }
+
+ i++;
+ }
+
+ /*
+ * If current extent durability is less than io_opts.data_replicas,
+ * we're not trying to rereplicate the extent up to data_replicas here -
+ * unless extra_replicas was specified
+ *
+ * Increasing replication is an explicit operation triggered by
+ * rereplicate, currently, so that users don't get an unexpected -ENOSPC
+ */
+ if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
+ durability_have >= io_opts.data_replicas) {
+ m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+ m->data_opts.rewrite_ptrs = 0;
+ /* if iter == NULL, it's just a promote */
+ if (iter)
+ ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
+ goto done;
+ }
+
+ m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+ m->data_opts.extra_replicas;
+ m->op.nr_replicas_required = m->op.nr_replicas;
+
+ BUG_ON(!m->op.nr_replicas);
+
+ if (reserve_sectors) {
+ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+ m->data_opts.extra_replicas
+ ? 0
+ : BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ goto err;
+ }
+
+ if (bkey_extent_is_unwritten(k)) {
+ bch2_update_unwritten_extent(trans, m);
+ goto done;
+ }
+
+ return 0;
+err:
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if ((1U << i) & ptrs_locked)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0);
+ percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref);
+ i++;
+ }
+
+ bch2_bkey_buf_exit(&m->k, c);
+ bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
+ return ret;
+done:
+ bch2_data_update_exit(m);
+ return ret ?: -BCH_ERR_data_update_done;
+}
+
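+/*
+ * Cached pointers can't be rewritten, only dropped: move any cached pointers
+ * in rewrite_ptrs over to kill_ptrs:
+ */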
+void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
+ opts->kill_ptrs |= 1U << i;
+ opts->rewrite_ptrs ^= 1U << i;
+ }
+
+ i++;
+ }
+}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
new file mode 100644
index 000000000000..991095bbd469
--- /dev/null
+++ b/fs/bcachefs/data_update.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHEFS_DATA_UPDATE_H
+#define _BCACHEFS_DATA_UPDATE_H
+
+#include "bkey_buf.h"
+#include "io_write_types.h"
+
+struct moving_context;
+
+struct data_update_opts {
+ unsigned rewrite_ptrs;
+ unsigned kill_ptrs;
+ u16 target;
+ u8 extra_replicas;
+ unsigned btree_insert_flags;
+ unsigned write_flags;
+};
+
+struct data_update {
+ /* extent being updated: */
+ enum btree_id btree_id;
+ struct bkey_buf k;
+ struct data_update_opts data_opts;
+ struct moving_context *ctxt;
+ struct bch_move_stats *stats;
+ struct bch_write_op op;
+};
+
+int bch2_data_update_index_update(struct bch_write_op *);
+
+void bch2_data_update_read_done(struct data_update *,
+ struct bch_extent_crc_unpacked);
+
+int bch2_extent_drop_ptrs(struct btree_trans *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct data_update_opts);
+
+void bch2_data_update_exit(struct data_update *);
+int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
+ struct moving_context *,
+ struct data_update *,
+ struct write_point_specifier,
+ struct bch_io_opts, struct data_update_opts,
+ enum btree_id, struct bkey_s_c);
+void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
+
+#endif /* _BCACHEFS_DATA_UPDATE_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
new file mode 100644
index 000000000000..57c5128db173
--- /dev/null
+++ b/fs/bcachefs/debug.c
@@ -0,0 +1,954 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Assorted bcachefs debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "fsck.h"
+#include "inode.h"
+#include "super.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bch_debug;
+
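+/*
+ * Read a btree node back from a single replica and compare it against the
+ * in-memory version; returns true if verification failed:
+ */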
+static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
+ struct extent_ptr_decoded pick)
+{
+ struct btree *v = c->verify_data;
+ struct btree_node *n_ondisk = c->verify_ondisk;
+ struct btree_node *n_sorted = c->verify_data->data;
+ struct bset *sorted, *inmemory = &b->data->keys;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct bio *bio;
+ bool failed = false, saw_error = false;
+
+ if (!bch2_dev_get_ioref(ca, READ))
+ return false;
+
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_sorted, btree_bytes(c)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bch2_bio_map(bio, n_sorted, btree_bytes(c));
+
+ submit_bio_wait(bio);
+
+ bio_put(bio);
+ percpu_ref_put(&ca->io_ref);
+
+ memcpy(n_ondisk, n_sorted, btree_bytes(c));
+
+ v->written = 0;
+ if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
+ return false;
+
+ n_sorted = c->verify_data->data;
+ sorted = &n_sorted->keys;
+
+ if (inmemory->u64s != sorted->u64s ||
+ memcmp(inmemory->start,
+ sorted->start,
+ vstruct_end(inmemory) - (void *) inmemory->start)) {
+ unsigned offset = 0, sectors;
+ struct bset *i;
+ unsigned j;
+
+ console_lock();
+
+ printk(KERN_ERR "*** in memory:\n");
+ bch2_dump_bset(c, b, inmemory, 0);
+
+ printk(KERN_ERR "*** read back in:\n");
+ bch2_dump_bset(c, v, sorted, 0);
+
+ while (offset < v->written) {
+ if (!offset) {
+ i = &n_ondisk->keys;
+ sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
+ c->block_bits;
+ } else {
+ struct btree_node_entry *bne =
+ (void *) n_ondisk + (offset << 9);
+ i = &bne->keys;
+
+ sectors = vstruct_blocks(bne, c->block_bits) <<
+ c->block_bits;
+ }
+
+ printk(KERN_ERR "*** on disk block %u:\n", offset);
+ bch2_dump_bset(c, b, i, offset);
+
+ offset += sectors;
+ }
+
+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
+ if (inmemory->_data[j] != sorted->_data[j])
+ break;
+
+ console_unlock();
+ bch_err(c, "verify failed at key %u", j);
+
+ failed = true;
+ }
+
+ if (v->written != b->written) {
+ bch_err(c, "written wrong: expected %u, got %u",
+ b->written, v->written);
+ failed = true;
+ }
+
+ return failed;
+}
+
+void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+ struct bkey_ptrs_c ptrs;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ struct btree *v;
+ struct bset *inmemory = &b->data->keys;
+ struct bkey_packed *k;
+ bool failed = false;
+
+ if (c->opts.nochanges)
+ return;
+
+ bch2_btree_node_io_lock(b);
+ mutex_lock(&c->verify_lock);
+
+ if (!c->verify_ondisk) {
+ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ if (!c->verify_ondisk)
+ goto out;
+ }
+
+ if (!c->verify_data) {
+ c->verify_data = __bch2_btree_node_mem_alloc(c);
+ if (!c->verify_data)
+ goto out;
+
+ list_del_init(&c->verify_data->list);
+ }
+
+ BUG_ON(b->nsets != 1);
+
+ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
+ if (k->type == KEY_TYPE_btree_ptr_v2)
+ ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
+
+ v = c->verify_data;
+ bkey_copy(&v->key, &b->key);
+ v->c.level = b->c.level;
+ v->c.btree_id = b->c.btree_id;
+ bch2_btree_keys_init(v);
+
+ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+ bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
+ failed |= bch2_btree_verify_replica(c, b, p);
+
+ if (failed) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ printbuf_exit(&buf);
+ }
+out:
+ mutex_unlock(&c->verify_lock);
+ bch2_btree_node_io_unlock(b);
+}
+
+void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
+ const struct btree *b)
+{
+ struct btree_node *n_ondisk = NULL;
+ struct extent_ptr_decoded pick;
+ struct bch_dev *ca;
+ struct bio *bio = NULL;
+ unsigned offset = 0;
+ int ret;
+
+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+ prt_printf(out, "error getting device to read from: invalid device\n");
+ return;
+ }
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ if (!bch2_dev_get_ioref(ca, READ)) {
+ prt_printf(out, "error getting device to read from: not online\n");
+ return;
+ }
+
+ n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ if (!n_ondisk) {
+ prt_printf(out, "memory allocation failure\n");
+ goto out;
+ }
+
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_ondisk, btree_bytes(c)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+
+ ret = submit_bio_wait(bio);
+ if (ret) {
+ prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
+ goto out;
+ }
+
+ while (offset < btree_sectors(c)) {
+ struct bset *i;
+ struct nonce nonce;
+ struct bch_csum csum;
+ struct bkey_packed *k;
+ unsigned sectors;
+
+ if (!offset) {
+ i = &n_ondisk->keys;
+
+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+ prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+ offset, BSET_CSUM_TYPE(i));
+ goto out;
+ }
+
+ nonce = btree_nonce(i, offset << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
+
+ if (bch2_crc_cmp(csum, n_ondisk->csum)) {
+ prt_printf(out, "invalid checksum\n");
+ goto out;
+ }
+
+ bset_encrypt(c, i, offset << 9);
+
+ sectors = vstruct_sectors(n_ondisk, c->block_bits);
+ } else {
+ struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
+
+ i = &bne->keys;
+
+ if (i->seq != n_ondisk->keys.seq)
+ break;
+
+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+ prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+ offset, BSET_CSUM_TYPE(i));
+ goto out;
+ }
+
+ nonce = btree_nonce(i, offset << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+ if (bch2_crc_cmp(csum, bne->csum)) {
+ prt_printf(out, "invalid checksum");
+ goto out;
+ }
+
+ bset_encrypt(c, i, offset << 9);
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ prt_printf(out, " offset %u version %u, journal seq %llu\n",
+ offset,
+ le16_to_cpu(i->version),
+ le64_to_cpu(i->journal_seq));
+ offset += sectors;
+
+ printbuf_indent_add(out, 4);
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
+ struct bkey u;
+
+ bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
+ prt_newline(out);
+ }
+
+ printbuf_indent_sub(out, 4);
+ }
+out:
+ if (bio)
+ bio_put(bio);
+ kvpfree(n_ondisk, btree_bytes(c));
+ percpu_ref_put(&ca->io_ref);
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: bch_fs refcounting */
+
+struct dump_iter {
+ struct bch_fs *c;
+ enum btree_id id;
+ struct bpos from;
+ struct bpos prev_node;
+ u64 iter;
+
+ struct printbuf buf;
+
+ char __user *ubuf; /* destination user buffer */
+ size_t size; /* size of requested read */
+ ssize_t ret; /* bytes read so far */
+};
+
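+/*
+ * Copy as much buffered output to userspace as will fit; returns 0 if the
+ * caller should keep generating output, i->ret once the user buffer is full,
+ * or -EFAULT if the copy failed:
+ */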
+static ssize_t flush_buf(struct dump_iter *i)
+{
+ if (i->buf.pos) {
+ size_t bytes = min_t(size_t, i->buf.pos, i->size);
+ int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
+
+ i->ret += copied;
+ i->ubuf += copied;
+ i->size -= copied;
+ i->buf.pos -= copied;
+ memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
+
+ if (copied != bytes)
+ return -EFAULT;
+ }
+
+ return i->size ? 0 : i->ret;
+}
+
+static int bch2_dump_open(struct inode *inode, struct file *file)
+{
+ struct btree_debug *bd = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+
+ file->private_data = i;
+ i->from = POS_MIN;
+ i->iter = 0;
+ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
+ i->id = bd->id;
+ i->buf = PRINTBUF;
+
+ return 0;
+}
+
+static int bch2_dump_release(struct inode *inode, struct file *file)
+{
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
+ return 0;
+}
+
+static ssize_t bch2_read_btree(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ ssize_t ret;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ trans = bch2_trans_get(i->c);
+ ret = for_each_btree_key2(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ prt_newline(&i->buf);
+ drop_locks_do(trans, flush_buf(i));
+ }));
+ i->from = iter.pos;
+
+ bch2_trans_put(trans);
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_btree,
+};
+
+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct btree *b;
+ ssize_t ret;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ if (bpos_eq(SPOS_MAX, i->from))
+ return i->ret;
+
+ trans = bch2_trans_get(i->c);
+retry:
+ bch2_trans_begin(trans);
+
+ for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
+ bch2_btree_node_to_text(&i->buf, i->c, b);
+ i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
+ ? bpos_successor(b->key.k.p)
+ : b->key.k.p;
+
+ ret = drop_locks_do(trans, flush_buf(i));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_format_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_btree_formats,
+};
+
+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ ssize_t ret;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ trans = bch2_trans_get(i->c);
+
+ ret = for_each_btree_key2(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ struct btree_path_level *l = &iter.path->l[0];
+ struct bkey_packed *_k =
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+
+ if (bpos_gt(l->b->key.k.p, i->prev_node)) {
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
+ i->prev_node = l->b->key.k.p;
+ }
+
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
+ drop_locks_do(trans, flush_buf(i));
+ }));
+ i->from = iter.pos;
+
+ bch2_trans_put(trans);
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_bfloat_failed,
+};
+
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+ struct btree *b)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "%px btree=%s l=%u ",
+ b,
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out, "flags: ");
+ prt_tab(out);
+ prt_bitflags(out, bch2_btree_node_flags, b->flags);
+ prt_newline(out);
+
+ prt_printf(out, "pcpu read locks: ");
+ prt_tab(out);
+ prt_printf(out, "%u", b->c.lock.readers != NULL);
+ prt_newline(out);
+
+ prt_printf(out, "written:");
+ prt_tab(out);
+ prt_printf(out, "%u", b->written);
+ prt_newline(out);
+
+ prt_printf(out, "writes blocked:");
+ prt_tab(out);
+ prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
+ prt_newline(out);
+
+ prt_printf(out, "will make reachable:");
+ prt_tab(out);
+ prt_printf(out, "%lx", b->will_make_reachable);
+ prt_newline(out);
+
+ prt_printf(out, "journal pin %px:", &b->writes[0].journal);
+ prt_tab(out);
+ prt_printf(out, "%llu", b->writes[0].journal.seq);
+ prt_newline(out);
+
+ prt_printf(out, "journal pin %px:", &b->writes[1].journal);
+ prt_tab(out);
+ prt_printf(out, "%llu", b->writes[1].journal.seq);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ rcu_read_lock();
+ i->buf.atomic++;
+ tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+ i->iter++;
+ } else {
+ done = true;
+ }
+ --i->buf.atomic;
+ rcu_read_unlock();
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations cached_btree_nodes_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_cached_btree_nodes_read,
+};
+
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+ u32 seq;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+restart:
+ seqmutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid <= i->iter)
+ continue;
+
+ closure_get(&trans->ref);
+ seq = seqmutex_seq(&c->btree_trans_lock);
+ seqmutex_unlock(&c->btree_trans_lock);
+
+ ret = flush_buf(i);
+ if (ret) {
+ closure_put(&trans->ref);
+ goto unlocked;
+ }
+
+ bch2_btree_trans_to_text(&i->buf, trans);
+
+ prt_printf(&i->buf, "backtrace:");
+ prt_newline(&i->buf);
+ printbuf_indent_add(&i->buf, 2);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+
+ i->iter = trans->locking_wait.task->pid;
+
+ closure_put(&trans->ref);
+
+ if (!seqmutex_relock(&c->btree_trans_lock, seq))
+ goto restart;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+unlocked:
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_transactions_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_transactions_read,
+};
+#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
+
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ break;
+
+ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
+ i->iter++;
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations journal_pins_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_journal_pins_read,
+};
+
+static int lock_held_stats_open(struct inode *inode, struct file *file)
+{
+ struct bch_fs *c = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+
+ if (!i)
+ return -ENOMEM;
+
+ i->iter = 0;
+ i->c = c;
+ i->buf = PRINTBUF;
+ file->private_data = i;
+
+ return 0;
+}
+
+static int lock_held_stats_release(struct inode *inode, struct file *file)
+{
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
+
+ return 0;
+}
+
+static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ while (1) {
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ break;
+
+ if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
+ !bch2_btree_transaction_fns[i->iter])
+ break;
+
+ prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
+ prt_newline(&i->buf);
+ printbuf_indent_add(&i->buf, 2);
+
+ mutex_lock(&s->lock);
+
+ prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
+ prt_newline(&i->buf);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
+ prt_printf(&i->buf, "Lock hold times:");
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ if (s->max_paths_text) {
+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ prt_str_indented(&i->buf, s->max_paths_text);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ mutex_unlock(&s->lock);
+
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+ i->iter++;
+ }
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations lock_held_stats_op = {
+ .owner = THIS_MODULE,
+ .open = lock_held_stats_open,
+ .release = lock_held_stats_release,
+ .read = lock_held_stats_read,
+};
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+ u32 seq;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (i->iter)
+ goto out;
+restart:
+ seqmutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid <= i->iter)
+ continue;
+
+ closure_get(&trans->ref);
+ seq = seqmutex_seq(&c->btree_trans_lock);
+ seqmutex_unlock(&c->btree_trans_lock);
+
+ ret = flush_buf(i);
+ if (ret) {
+ closure_put(&trans->ref);
+ goto out;
+ }
+
+ bch2_check_for_deadlock(trans, &i->buf);
+
+ i->iter = trans->locking_wait.task->pid;
+
+ closure_put(&trans->ref);
+
+ if (!seqmutex_relock(&c->btree_trans_lock, seq))
+ goto restart;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+out:
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_deadlock_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_deadlock_read,
+};
+
+void bch2_fs_debug_exit(struct bch_fs *c)
+{
+ if (!IS_ERR_OR_NULL(c->fs_debug_dir))
+ debugfs_remove_recursive(c->fs_debug_dir);
+}
+
+void bch2_fs_debug_init(struct bch_fs *c)
+{
+ struct btree_debug *bd;
+ char name[100];
+
+ if (IS_ERR_OR_NULL(bch_debug))
+ return;
+
+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+ c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+ if (IS_ERR_OR_NULL(c->fs_debug_dir))
+ return;
+
+ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+ c->btree_debug, &cached_btree_nodes_ops);
+
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
+ debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_transactions_ops);
+#endif
+
+ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+ c->btree_debug, &journal_pins_ops);
+
+ debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
+ c, &lock_held_stats_op);
+
+ debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_deadlock_ops);
+
+ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+ if (IS_ERR_OR_NULL(c->btree_debug_dir))
+ return;
+
+ for (bd = c->btree_debug;
+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
+ bd++) {
+ bd->id = bd - c->btree_debug;
+ debugfs_create_file(bch2_btree_id_str(bd->id),
+ 0400, c->btree_debug_dir, bd,
+ &btree_debug_ops);
+
+ snprintf(name, sizeof(name), "%s-formats",
+ bch2_btree_id_str(bd->id));
+
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &btree_format_debug_ops);
+
+ snprintf(name, sizeof(name), "%s-bfloat-failed",
+ bch2_btree_id_str(bd->id));
+
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &bfloat_failed_debug_ops);
+ }
+}
+
+#endif
+
+void bch2_debug_exit(void)
+{
+ if (!IS_ERR_OR_NULL(bch_debug))
+ debugfs_remove_recursive(bch_debug);
+}
+
+int __init bch2_debug_init(void)
+{
+ int ret = 0;
+
+ bch_debug = debugfs_create_dir("bcachefs", NULL);
+ return ret;
+}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
new file mode 100644
index 000000000000..2c37143b5fd1
--- /dev/null
+++ b/fs/bcachefs/debug.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DEBUG_H
+#define _BCACHEFS_DEBUG_H
+
+#include "bcachefs.h"
+
+struct bio;
+struct btree;
+struct bch_fs;
+
+void __bch2_btree_verify(struct bch_fs *, struct btree *);
+void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
+ const struct btree *);
+
+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+ if (bch2_verify_btree_ondisk)
+ __bch2_btree_verify(c, b);
+}
+
+#ifdef CONFIG_DEBUG_FS
+void bch2_fs_debug_exit(struct bch_fs *);
+void bch2_fs_debug_init(struct bch_fs *);
+#else
+static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
+static inline void bch2_fs_debug_init(struct bch_fs *c) {}
+#endif
+
+void bch2_debug_exit(void);
+int bch2_debug_init(void);
+
+#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
new file mode 100644
index 000000000000..2bfff0da7000
--- /dev/null
+++ b/fs/bcachefs/dirent.c
@@ -0,0 +1,580 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "dirent.h"
+#include "fs.h"
+#include "keylist.h"
+#include "str_hash.h"
+#include "subvolume.h"
+
+#include <linux/dcache.h>
+
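+/*
+ * Dirent names are NUL padded out to a multiple of 8 bytes; strip trailing
+ * NULs from the last u64 to get the real name length:
+ */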
+static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
+{
+ unsigned bkey_u64s = bkey_val_u64s(d.k);
+ unsigned bkey_bytes = bkey_u64s * sizeof(u64);
+ u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
+#if CPU_BIG_ENDIAN
+ unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
+#else
+ unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
+#endif
+
+ return bkey_bytes -
+ offsetof(struct bch_dirent, d_name) -
+ trailing_nuls;
+}
+
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
+{
+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+}
+
+static u64 bch2_dirent_hash(const struct bch_hash_info *info,
+ const struct qstr *name)
+{
+ struct bch_str_hash_ctx ctx;
+
+ bch2_str_hash_init(&ctx, info);
+ bch2_str_hash_update(&ctx, info, name->name, name->len);
+
+ /* [0,2) reserved for dots */
+ return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
+}
+
+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
+{
+ return bch2_dirent_hash(info, key);
+}
+
+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr name = bch2_dirent_get_name(d);
+
+ return bch2_dirent_hash(info, &name);
+}
+
+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+ const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr *r_name = _r;
+
+ return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len);
+}
+
+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
+ const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr r_name = bch2_dirent_get_name(r);
+
+ return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len);
+}
+
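+/*
+ * A dirent pointing to a subvolume is only visible from its parent
+ * subvolume:
+ */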
+static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_SUBVOL)
+ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
+ return true;
+}
+
+const struct bch_hash_desc bch2_dirent_hash_desc = {
+ .btree_id = BTREE_ID_dirents,
+ .key_type = KEY_TYPE_dirent,
+ .hash_key = dirent_hash_key,
+ .hash_bkey = dirent_hash_bkey,
+ .cmp_key = dirent_cmp_key,
+ .cmp_bkey = dirent_cmp_bkey,
+ .is_visible = dirent_is_visible,
+};
+
+int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr d_name = bch2_dirent_get_name(d);
+ int ret = 0;
+
+ bkey_fsck_err_on(!d_name.len, c, err,
+ dirent_empty_name,
+ "empty name");
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
+ dirent_val_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
+
+ /*
+ * Check new keys don't exceed the max length
+ * (older keys may be larger.)
+ */
+ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
+ dirent_name_too_long,
+ "dirent name too big (%u > %u)",
+ d_name.len, BCH_NAME_MAX);
+
+ bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
+ dirent_name_embedded_nul,
+ "dirent has stray data after name's NUL");
+
+ bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
+ (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
+ dirent_name_dot_or_dotdot,
+ "invalid name");
+
+ bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
+ dirent_name_has_slash,
+ "name with /");
+
+ bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
+ dirent_to_itself,
+ "dirent points to own directory");
+fsck_err:
+ return ret;
+}
+
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr d_name = bch2_dirent_get_name(d);
+
+ prt_printf(out, "%.*s -> %llu type %s",
+ d_name.len,
+ d_name.name,
+ d.v->d_type != DT_SUBVOL
+ ? le64_to_cpu(d.v->d_inum)
+ : le32_to_cpu(d.v->d_child_subvol),
+ bch2_d_type_str(d.v->d_type));
+}
+
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+ subvol_inum dir, u8 type,
+ const struct qstr *name, u64 dst)
+{
+ struct bkey_i_dirent *dirent;
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
+
+ if (name->len > BCH_NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ BUG_ON(u64s > U8_MAX);
+
+ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+ if (IS_ERR(dirent))
+ return dirent;
+
+ bkey_dirent_init(&dirent->k_i);
+ dirent->k.u64s = u64s;
+
+ if (type != DT_SUBVOL) {
+ dirent->v.d_inum = cpu_to_le64(dst);
+ } else {
+ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+ dirent->v.d_child_subvol = cpu_to_le32(dst);
+ }
+
+ dirent->v.d_type = type;
+
+ memcpy(dirent->v.d_name, name->name, name->len);
+ memset(dirent->v.d_name + name->len, 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_name) -
+ name->len);
+
+ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+
+ return dirent;
+}
+
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ u8 type, const struct qstr *name, u64 dst_inum,
+ u64 *dir_offset, int flags)
+{
+ struct bkey_i_dirent *dirent;
+ int ret;
+
+ dirent = dirent_create_key(trans, dir, type, name, dst_inum);
+ ret = PTR_ERR_OR_ZERO(dirent);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+ dir, &dirent->k_i, flags);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
+}
+
+static void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+ struct bkey_s_c_dirent d, subvol_inum *target)
+{
+ struct bch_subvolume s;
+ int ret = 0;
+
+ if (d.v->d_type == DT_SUBVOL &&
+ le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
+ return 1;
+
+ if (likely(d.v->d_type != DT_SUBVOL)) {
+ target->subvol = dir.subvol;
+ target->inum = le64_to_cpu(d.v->d_inum);
+ } else {
+ target->subvol = le32_to_cpu(d.v->d_child_subvol);
+
+ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+
+ target->inum = le64_to_cpu(s.inode);
+ }
+
+ return ret;
+}
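+
+/*
+ * Return convention (see the callers below): 0 with *target filled in on
+ * success, a negative error on failure, or 1 when @d is a subvolume dirent
+ * that isn't visible from @dir's subvolume - readdir skips such entries and
+ * lookup maps the result to -ENOENT.
+ */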
+
+int bch2_dirent_rename(struct btree_trans *trans,
+ subvol_inum src_dir, struct bch_hash_info *src_hash,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+ enum bch_rename_mode mode)
+{
+ struct btree_iter src_iter = { NULL };
+ struct btree_iter dst_iter = { NULL };
+ struct bkey_s_c old_src, old_dst = bkey_s_c_null;
+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
+ struct bpos dst_pos =
+ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
+ unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+ int ret = 0;
+
+ if (src_dir.subvol != dst_dir.subvol)
+ return -EXDEV;
+
+ memset(src_inum, 0, sizeof(*src_inum));
+ memset(dst_inum, 0, sizeof(*dst_inum));
+
+ /* Lookup src: */
+ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+ src_hash, src_dir, src_name,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto out;
+
+ old_src = bch2_btree_iter_peek_slot(&src_iter);
+ ret = bkey_err(old_src);
+ if (ret)
+ goto out;
+
+ ret = bch2_dirent_read_target(trans, src_dir,
+ bkey_s_c_to_dirent(old_src), src_inum);
+ if (ret)
+ goto out;
+
+ src_type = bkey_s_c_to_dirent(old_src).v->d_type;
+
+ if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
+ return -EOPNOTSUPP;
+
+ /* Lookup dst: */
+ if (mode == BCH_RENAME) {
+ /*
+ * Note that we're _not_ checking if the target already exists -
+ * we're relying on the VFS to do that check for us for
+ * correctness:
+ */
+ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name);
+ if (ret)
+ goto out;
+ } else {
+ ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto out;
+
+ old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+ ret = bkey_err(old_dst);
+ if (ret)
+ goto out;
+
+ ret = bch2_dirent_read_target(trans, dst_dir,
+ bkey_s_c_to_dirent(old_dst), dst_inum);
+ if (ret)
+ goto out;
+
+ dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
+
+ if (dst_type == DT_SUBVOL)
+ return -EOPNOTSUPP;
+ }
+
+ if (mode != BCH_RENAME_EXCHANGE)
+ *src_offset = dst_iter.pos.offset;
+
+ /* Create new dst key: */
+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
+ ret = PTR_ERR_OR_ZERO(new_dst);
+ if (ret)
+ goto out;
+
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+ new_dst->k.p = dst_iter.pos;
+
+ /* Create new src key: */
+ if (mode == BCH_RENAME_EXCHANGE) {
+ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
+ ret = PTR_ERR_OR_ZERO(new_src);
+ if (ret)
+ goto out;
+
+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+ new_src->k.p = src_iter.pos;
+ } else {
+ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ ret = PTR_ERR_OR_ZERO(new_src);
+ if (ret)
+ goto out;
+
+ bkey_init(&new_src->k);
+ new_src->k.p = src_iter.pos;
+
+ if (bkey_le(dst_pos, src_iter.pos) &&
+ bkey_lt(src_iter.pos, dst_iter.pos)) {
+ /*
+ * We have a hash collision for the new dst key,
+ * and new_src - the key we're deleting - is between
+ * new_dst's hashed slot and the slot we're going to be
+ * inserting it into - oops. This will break the hash
+ * table if we don't deal with it:
+ */
+ if (mode == BCH_RENAME) {
+ /*
+ * If we're not overwriting, we can just insert
+ * new_dst at the src position:
+ */
+ new_src = new_dst;
+ new_src->k.p = src_iter.pos;
+ goto out_set_src;
+ } else {
+				/*
+				 * If we're overwriting, we can't insert new_dst
+				 * at a different slot because it has to
+				 * overwrite old_dst - just make sure to use a
+				 * whiteout when deleting src:
+				 */
+ new_src->k.type = KEY_TYPE_hash_whiteout;
+ }
+ } else {
+ /* Check if we need a whiteout to delete src: */
+ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
+ src_hash, &src_iter);
+ if (ret < 0)
+ goto out;
+
+ if (ret)
+ new_src->k.type = KEY_TYPE_hash_whiteout;
+ }
+ }
+
+ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
+ if (ret)
+ goto out;
+out_set_src:
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot:
+ */
+ if (src_type == DT_SUBVOL) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter);
+ if (ret)
+ goto out;
+
+ new_src->k.p = src_iter.pos;
+ src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+ }
+
+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
+ if (ret)
+ goto out;
+
+ if (mode == BCH_RENAME_EXCHANGE)
+ *src_offset = new_src->k.p.offset;
+ *dst_offset = new_dst->k.p.offset;
+out:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
+}
+
+int __bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
+{
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ d = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, dir, d, inum);
+ if (ret > 0)
+ ret = -ENOENT;
+err:
+ if (ret)
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret;
+}
+
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
+ name, inum, 0);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(dir, 0, snapshot),
+ POS(dir, U64_MAX), 0, k, ret)
+ if (k.k->type == KEY_TYPE_dirent) {
+ ret = -ENOTEMPTY;
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
+{
+ u32 snapshot;
+
+ return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
+ bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
+}
+
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent dirent;
+ subvol_inum target;
+ u32 snapshot;
+ struct bkey_buf sk;
+ struct qstr name;
+ int ret;
+
+ bch2_bkey_buf_init(&sk);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(inum.inum, ctx->pos, snapshot),
+ POS(inum.inum, U64_MAX), 0, k, ret) {
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ dirent = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, inum, dirent, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ dirent = bkey_i_to_s_c_dirent(sk.k);
+ bch2_trans_unlock(trans);
+
+ name = bch2_dirent_get_name(dirent);
+
+ ctx->pos = dirent.k->p.offset;
+ if (!dir_emit(ctx, name.name,
+ name.len,
+ target.inum,
+ vfs_d_type(dirent.v->d_type)))
+ break;
+ ctx->pos = dirent.k->p.offset + 1;
+
+		/*
+		 * read_target looks up subvolumes; each lookup takes a btree
+		 * path, so a directory with many subvolumes in it can overflow
+		 * the transaction's paths:
+		 */
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+
+ return ret;
+}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
new file mode 100644
index 000000000000..1e3431990abd
--- /dev/null
+++ b/fs/bcachefs/dirent.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_H
+#define _BCACHEFS_DIRENT_H
+
+#include "str_hash.h"
+
+enum bkey_invalid_flags;
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
+
+int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
+ .key_invalid = bch2_dirent_invalid, \
+ .val_to_text = bch2_dirent_to_text, \
+ .min_val_size = 16, \
+})
+
+struct qstr;
+struct file;
+struct dir_context;
+struct bch_fs;
+struct bch_hash_info;
+struct bch_inode_info;
+
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
+
+static inline unsigned dirent_val_u64s(unsigned len)
+{
+ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
+ sizeof(u64));
+}
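+
+/*
+ * Example (illustrative, assuming offsetof(struct bch_dirent, d_name) == 9):
+ * dirent_val_u64s(3)  == DIV_ROUND_UP(9 + 3,  8) == 2 u64s (16 bytes),
+ * dirent_val_u64s(16) == DIV_ROUND_UP(9 + 16, 8) == 4 u64s (32 bytes);
+ * the value is always padded out to a whole number of u64s with NULs.
+ */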
+
+int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
+ struct bkey_s_c_dirent, subvol_inum *);
+
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, u64 *, int);
+
+static inline unsigned vfs_d_type(unsigned type)
+{
+ return type == DT_SUBVOL ? DT_DIR : type;
+}
+
+enum bch_rename_mode {
+ BCH_RENAME,
+ BCH_RENAME_OVERWRITE,
+ BCH_RENAME_EXCHANGE,
+};
+
+int bch2_dirent_rename(struct btree_trans *,
+ subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, u64 *,
+ const struct qstr *, subvol_inum *, u64 *,
+ enum bch_rename_mode);
+
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+ subvol_inum, const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+ const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *);
+
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+
+#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
new file mode 100644
index 000000000000..4d0cb0ccff32
--- /dev/null
+++ b/fs/bcachefs/disk_groups.c
@@ -0,0 +1,622 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+
+static int group_cmp(const void *_l, const void *_r)
+{
+ const struct bch_disk_group *l = _l;
+ const struct bch_disk_group *r = _r;
+
+ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
+ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
+ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
+ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
+ strncmp(l->label, r->label, sizeof(l->label));
+}
+
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_disk_groups *groups =
+ field_to_type(f, disk_groups);
+ struct bch_disk_group *g, *sorted = NULL;
+ unsigned nr_groups = disk_groups_nr(groups);
+ unsigned i, len;
+ int ret = 0;
+
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member m = bch2_sb_member_get(sb, i);
+ unsigned group_id;
+
+ if (!BCH_MEMBER_GROUP(&m))
+ continue;
+
+ group_id = BCH_MEMBER_GROUP(&m) - 1;
+
+ if (group_id >= nr_groups) {
+ prt_printf(err, "disk %u has invalid label %u (have %u)",
+ i, group_id, nr_groups);
+ return -BCH_ERR_invalid_sb_disk_groups;
+ }
+
+ if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
+ prt_printf(err, "disk %u has deleted label %u", i, group_id);
+ return -BCH_ERR_invalid_sb_disk_groups;
+ }
+ }
+
+ if (!nr_groups)
+ return 0;
+
+ for (i = 0; i < nr_groups; i++) {
+ g = groups->entries + i;
+
+ if (BCH_GROUP_DELETED(g))
+ continue;
+
+ len = strnlen(g->label, sizeof(g->label));
+ if (!len) {
+ prt_printf(err, "label %u empty", i);
+ return -BCH_ERR_invalid_sb_disk_groups;
+ }
+ }
+
+ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
+ if (!sorted)
+ return -BCH_ERR_ENOMEM_disk_groups_validate;
+
+ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
+ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
+
+ for (g = sorted; g + 1 < sorted + nr_groups; g++)
+ if (!BCH_GROUP_DELETED(g) &&
+ !group_cmp(&g[0], &g[1])) {
+ prt_printf(err, "duplicate label %llu.%.*s",
+ BCH_GROUP_PARENT(g),
+ (int) sizeof(g->label), g->label);
+ ret = -BCH_ERR_invalid_sb_disk_groups;
+ goto err;
+ }
+err:
+ kfree(sorted);
+ return ret;
+}
+
+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_disk_groups_cpu *g;
+ struct bch_dev *ca;
+ int i;
+ unsigned iter;
+
+ out->atomic++;
+ rcu_read_lock();
+
+ g = rcu_dereference(c->disk_groups);
+ if (!g)
+ goto out;
+
+ for (i = 0; i < g->nr; i++) {
+ if (i)
+ prt_printf(out, " ");
+
+ if (g->entries[i].deleted) {
+ prt_printf(out, "[deleted]");
+ continue;
+ }
+
+ prt_printf(out, "[parent %d devs", g->entries[i].parent);
+ for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs)
+ prt_printf(out, " %s", ca->name);
+ prt_printf(out, "]");
+ }
+
+out:
+ rcu_read_unlock();
+ out->atomic--;
+}
+
+static void bch2_sb_disk_groups_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_disk_groups *groups =
+ field_to_type(f, disk_groups);
+ struct bch_disk_group *g;
+ unsigned nr_groups = disk_groups_nr(groups);
+
+ for (g = groups->entries;
+ g < groups->entries + nr_groups;
+ g++) {
+ if (g != groups->entries)
+ prt_printf(out, " ");
+
+ if (BCH_GROUP_DELETED(g))
+ prt_printf(out, "[deleted]");
+ else
+ prt_printf(out, "[parent %llu name %s]",
+ BCH_GROUP_PARENT(g), g->label);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
+ .validate = bch2_sb_disk_groups_validate,
+ .to_text = bch2_sb_disk_groups_to_text
+};
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_disk_groups *groups;
+ struct bch_disk_groups_cpu *cpu_g, *old_g;
+ unsigned i, g, nr_groups;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
+ nr_groups = disk_groups_nr(groups);
+
+ if (!groups)
+ return 0;
+
+ cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
+ if (!cpu_g)
+ return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
+
+ cpu_g->nr = nr_groups;
+
+ for (i = 0; i < nr_groups; i++) {
+ struct bch_disk_group *src = &groups->entries[i];
+ struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
+
+ dst->deleted = BCH_GROUP_DELETED(src);
+ dst->parent = BCH_GROUP_PARENT(src);
+ memcpy(dst->label, src->label, sizeof(dst->label));
+ }
+
+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
+ struct bch_disk_group_cpu *dst;
+
+ if (!bch2_member_exists(&m))
+ continue;
+
+ g = BCH_MEMBER_GROUP(&m);
+ while (g) {
+ dst = &cpu_g->entries[g - 1];
+ __set_bit(i, dst->devs.d);
+ g = dst->parent;
+ }
+ }
+
+ old_g = rcu_dereference_protected(c->disk_groups,
+ lockdep_is_held(&c->sb_lock));
+ rcu_assign_pointer(c->disk_groups, cpu_g);
+ if (old_g)
+ kfree_rcu(old_g, rcu);
+
+ return 0;
+}
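+
+/*
+ * Note that when building the cpu masks, each member's label is walked up the
+ * parent chain, so a device assigned to a nested label is also set in the
+ * device mask of every ancestor label - targeting a parent label therefore
+ * includes the devices of all its children.
+ */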
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+ struct target t = target_decode(target);
+ struct bch_devs_mask *devs;
+
+ rcu_read_lock();
+
+ switch (t.type) {
+ case TARGET_NULL:
+ devs = NULL;
+ break;
+ case TARGET_DEV: {
+ struct bch_dev *ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+ devs = ca ? &ca->self : NULL;
+ break;
+ }
+ case TARGET_GROUP: {
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+ devs = g && t.group < g->nr && !g->entries[t.group].deleted
+ ? &g->entries[t.group].devs
+ : NULL;
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ rcu_read_unlock();
+
+ return devs;
+}
+
+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+ struct target t = target_decode(target);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ return false;
+ case TARGET_DEV:
+ return dev == t.dev;
+ case TARGET_GROUP: {
+ struct bch_disk_groups_cpu *g;
+ const struct bch_devs_mask *m;
+ bool ret;
+
+ rcu_read_lock();
+ g = rcu_dereference(c->disk_groups);
+ m = g && t.group < g->nr && !g->entries[t.group].deleted
+ ? &g->entries[t.group].devs
+ : NULL;
+
+ ret = m ? test_bit(dev, m->d) : false;
+ rcu_read_unlock();
+
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
+
+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+ unsigned parent,
+ const char *name, unsigned namelen)
+{
+ unsigned i, nr_groups = disk_groups_nr(groups);
+
+ if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+ return -EINVAL;
+
+ for (i = 0; i < nr_groups; i++) {
+ struct bch_disk_group *g = groups->entries + i;
+
+ if (BCH_GROUP_DELETED(g))
+ continue;
+
+		if (BCH_GROUP_PARENT(g) == parent &&
+ strnlen(g->label, sizeof(g->label)) == namelen &&
+ !memcmp(name, g->label, namelen))
+ return i;
+ }
+
+ return -1;
+}
+
+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
+ const char *name, unsigned namelen)
+{
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_field_get(sb->sb, disk_groups);
+ unsigned i, nr_groups = disk_groups_nr(groups);
+ struct bch_disk_group *g;
+
+ if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+ return -EINVAL;
+
+ for (i = 0;
+ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
+ i++)
+ ;
+
+ if (i == nr_groups) {
+ unsigned u64s =
+ (sizeof(struct bch_sb_field_disk_groups) +
+ sizeof(struct bch_disk_group) * (nr_groups + 1)) /
+ sizeof(u64);
+
+ groups = bch2_sb_field_resize(sb, disk_groups, u64s);
+ if (!groups)
+ return -BCH_ERR_ENOSPC_disk_label_add;
+
+ nr_groups = disk_groups_nr(groups);
+ }
+
+ BUG_ON(i >= nr_groups);
+
+ g = &groups->entries[i];
+
+ memcpy(g->label, name, namelen);
+ if (namelen < sizeof(g->label))
+ g->label[namelen] = '\0';
+ SET_BCH_GROUP_DELETED(g, 0);
+ SET_BCH_GROUP_PARENT(g, parent);
+ SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+
+ return i;
+}
+
+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
+{
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_field_get(sb->sb, disk_groups);
+ int v = -1;
+
+ do {
+ const char *next = strchrnul(name, '.');
+ unsigned len = next - name;
+
+ if (*next == '.')
+ next++;
+
+ v = __bch2_disk_group_find(groups, v + 1, name, len);
+ name = next;
+ } while (*name && v >= 0);
+
+ return v;
+}
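+
+/*
+ * Example (sketch): looking up a nested label such as "ssd.ssd1" (hypothetical
+ * names) resolves one '.'-separated component at a time; each component is
+ * matched with the previous component's index + 1 as the parent (0 meaning
+ * "top level"), mirroring how BCH_GROUP_PARENT() and BCH_MEMBER_GROUP() store
+ * index + 1 with 0 reserved for "none".
+ */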
+
+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
+{
+ struct bch_sb_field_disk_groups *groups;
+ unsigned parent = 0;
+ int v = -1;
+
+ do {
+ const char *next = strchrnul(name, '.');
+ unsigned len = next - name;
+
+ if (*next == '.')
+ next++;
+
+ groups = bch2_sb_field_get(sb->sb, disk_groups);
+
+ v = __bch2_disk_group_find(groups, parent, name, len);
+ if (v < 0)
+ v = __bch2_disk_group_add(sb, parent, name, len);
+ if (v < 0)
+ return v;
+
+ parent = v + 1;
+ name = next;
+ } while (*name && v >= 0);
+
+ return v;
+}
+
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ struct bch_disk_groups_cpu *groups;
+ struct bch_disk_group_cpu *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ out->atomic++;
+ rcu_read_lock();
+ groups = rcu_dereference(c->disk_groups);
+ if (!groups)
+ goto invalid;
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto invalid;
+
+ if (v >= groups->nr)
+ goto invalid;
+
+ g = groups->entries + v;
+
+ if (g->deleted)
+ goto invalid;
+
+ path[nr++] = v;
+
+ if (!g->parent)
+ break;
+
+ v = g->parent - 1;
+ }
+
+ while (nr) {
+ v = path[--nr];
+ g = groups->entries + v;
+
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ if (nr)
+ prt_printf(out, ".");
+ }
+out:
+ rcu_read_unlock();
+ out->atomic--;
+ return;
+invalid:
+ prt_printf(out, "invalid label %u", v);
+ goto out;
+}
+
+void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_field_get(sb, disk_groups);
+ struct bch_disk_group *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto inval;
+
+ if (v >= disk_groups_nr(groups))
+ goto inval;
+
+ g = groups->entries + v;
+
+ if (BCH_GROUP_DELETED(g))
+ goto inval;
+
+ path[nr++] = v;
+
+ if (!BCH_GROUP_PARENT(g))
+ break;
+
+ v = BCH_GROUP_PARENT(g) - 1;
+ }
+
+ while (nr) {
+ v = path[--nr];
+ g = groups->entries + v;
+
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ if (nr)
+ prt_printf(out, ".");
+ }
+ return;
+inval:
+ prt_printf(out, "invalid label %u", v);
+}
+
+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ struct bch_member *mi;
+ int ret, v = -1;
+
+ if (!strlen(name) || !strcmp(name, "none"))
+ return 0;
+
+ v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+ if (v < 0)
+ return v;
+
+ ret = bch2_sb_disk_groups_to_cpu(c);
+ if (ret)
+ return ret;
+
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_GROUP(mi, v + 1);
+ return 0;
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = __bch2_dev_group_set(c, ca, name) ?:
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
+ struct printbuf *err)
+{
+ struct bch_dev *ca;
+ int g;
+
+ if (!val)
+ return -EINVAL;
+
+ if (!c)
+ return 0;
+
+ if (!strlen(val) || !strcmp(val, "none")) {
+ *res = 0;
+ return 0;
+ }
+
+ /* Is it a device? */
+ ca = bch2_dev_lookup(c, val);
+ if (!IS_ERR(ca)) {
+ *res = dev_to_target(ca->dev_idx);
+ percpu_ref_put(&ca->ref);
+ return 0;
+ }
+
+ mutex_lock(&c->sb_lock);
+ g = bch2_disk_path_find(&c->disk_sb, val);
+ mutex_unlock(&c->sb_lock);
+
+ if (g >= 0) {
+ *res = group_to_target(g);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ struct target t = target_decode(v);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ prt_printf(out, "none");
+ break;
+ case TARGET_DEV: {
+ struct bch_dev *ca;
+
+ out->atomic++;
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ prt_printf(out, "offline device %u", t.dev);
+ } else {
+ prt_printf(out, "invalid device %u", t.dev);
+ }
+
+ rcu_read_unlock();
+ out->atomic--;
+ break;
+ }
+ case TARGET_GROUP:
+ bch2_disk_path_to_text(out, c, t.group);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+ struct target t = target_decode(v);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ prt_printf(out, "none");
+ break;
+ case TARGET_DEV: {
+ struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+ if (bch2_dev_exists(sb, t.dev)) {
+ prt_printf(out, "Device ");
+ pr_uuid(out, m.uuid.b);
+ prt_printf(out, " (%u)", t.dev);
+ } else {
+ prt_printf(out, "Bad device %u", t.dev);
+ }
+ break;
+ }
+ case TARGET_GROUP:
+ bch2_disk_path_to_text_sb(out, sb, t.group);
+ break;
+ default:
+ BUG();
+ }
+}
+
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ if (c)
+ bch2_target_to_text(out, c, v);
+ else
+ bch2_target_to_text_sb(out, sb, v);
+}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
new file mode 100644
index 000000000000..441826fff224
--- /dev/null
+++ b/fs/bcachefs/disk_groups.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_H
+#define _BCACHEFS_DISK_GROUPS_H
+
+#include "disk_groups_types.h"
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
+
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+ return groups
+ ? (vstruct_end(&groups->field) -
+ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
+ : 0;
+}
+
+struct target {
+ enum {
+ TARGET_NULL,
+ TARGET_DEV,
+ TARGET_GROUP,
+ } type;
+ union {
+ unsigned dev;
+ unsigned group;
+ };
+};
+
+#define TARGET_DEV_START 1
+#define TARGET_GROUP_START (256 + TARGET_DEV_START)
+
+static inline u16 dev_to_target(unsigned dev)
+{
+ return TARGET_DEV_START + dev;
+}
+
+static inline u16 group_to_target(unsigned group)
+{
+ return TARGET_GROUP_START + group;
+}
+
+static inline struct target target_decode(unsigned target)
+{
+ if (target >= TARGET_GROUP_START)
+ return (struct target) {
+ .type = TARGET_GROUP,
+ .group = target - TARGET_GROUP_START
+ };
+
+ if (target >= TARGET_DEV_START)
+ return (struct target) {
+ .type = TARGET_DEV,
+			.dev = target - TARGET_DEV_START
+ };
+
+ return (struct target) { .type = TARGET_NULL };
+}
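+
+/*
+ * Worked example of the encoding: target 0 is "none"; targets 1..256 are
+ * devices, so dev_to_target(0) == 1 and target_decode(1) yields
+ * { TARGET_DEV, .dev = 0 }; targets 257 and up are labels, so
+ * group_to_target(3) == 260 and target_decode(260) yields
+ * { TARGET_GROUP, .group = 3 }.
+ */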
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
+ enum bch_data_type data_type,
+ u16 target)
+{
+ struct bch_devs_mask devs = c->rw_devs[data_type];
+ const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
+
+ if (t)
+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+ return devs;
+}
+
+static inline bool bch2_target_accepts_data(struct bch_fs *c,
+ enum bch_data_type data_type,
+ u16 target)
+{
+ struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
+ return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
+}
+
+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
+
+int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+
+/* Exported for userspace bcachefs-tools: */
+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+
+void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
+void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+
+#define bch2_opt_target (struct bch_opt_fn) { \
+ .parse = bch2_opt_target_parse, \
+ .to_text = bch2_opt_target_to_text, \
+}
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+
+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+
+const char *bch2_sb_validate_disk_groups(struct bch_sb *,
+ struct bch_sb_field *);
+
+void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h
new file mode 100644
index 000000000000..a54ef085b13d
--- /dev/null
+++ b/fs/bcachefs/disk_groups_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
+#define _BCACHEFS_DISK_GROUPS_TYPES_H
+
+struct bch_disk_group_cpu {
+ bool deleted;
+ u16 parent;
+ u8 label[BCH_SB_LABEL_SIZE];
+ struct bch_devs_mask devs;
+};
+
+struct bch_disk_groups_cpu {
+ struct rcu_head rcu;
+ unsigned nr;
+ struct bch_disk_group_cpu entries[] __counted_by(nr);
+};
+
+#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
new file mode 100644
index 000000000000..2a77de18c004
--- /dev/null
+++ b/fs/bcachefs/ec.c
@@ -0,0 +1,1981 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* erasure coding */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "keylist.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "util.h"
+
+#include <linux/sort.h>
+
+#ifdef __KERNEL__
+
+#include <linux/raid/pq.h>
+#include <linux/raid/xor.h>
+
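+/*
+ * Recover a single missing block as the XOR of all the other blocks (standard
+ * RAID5 parity); raid_gen() below also uses this with failed_idx == nd to
+ * compute the P block for single-parity stripes.
+ */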
+static void raid5_recov(unsigned disks, unsigned failed_idx,
+ size_t size, void **data)
+{
+ unsigned i = 2, nr;
+
+ BUG_ON(failed_idx >= disks);
+
+ swap(data[0], data[failed_idx]);
+ memcpy(data[0], data[1], size);
+
+ while (i < disks) {
+ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
+ xor_blocks(nr, size, data[0], data + i);
+ i += nr;
+ }
+
+ swap(data[0], data[failed_idx]);
+}
+
+static void raid_gen(int nd, int np, size_t size, void **v)
+{
+ if (np >= 1)
+ raid5_recov(nd + np, nd, size, v);
+ if (np >= 2)
+ raid6_call.gen_syndrome(nd + np, size, v);
+ BUG_ON(np > 2);
+}
+
+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+ switch (nr) {
+ case 0:
+ break;
+ case 1:
+ if (ir[0] < nd + 1)
+ raid5_recov(nd + 1, ir[0], size, v);
+ else
+ raid6_call.gen_syndrome(nd + np, size, v);
+ break;
+ case 2:
+ if (ir[1] < nd) {
+ /* data+data failure. */
+ raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
+ } else if (ir[0] < nd) {
+ /* data + p/q failure */
+
+ if (ir[1] == nd) /* data + p failure */
+ raid6_datap_recov(nd + np, size, ir[0], v);
+ else { /* data + q failure */
+ raid5_recov(nd + 1, ir[0], size, v);
+ raid6_call.gen_syndrome(nd + np, size, v);
+ }
+ } else {
+ raid_gen(nd, np, size, v);
+ }
+ break;
+ default:
+ BUG();
+ }
+}
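+
+/*
+ * Illustrative sketch (not part of the original patch; ec_raid_example is a
+ * hypothetical helper): generate parity for a 2 data + 2 parity stripe held in
+ * four buffers of @size bytes (data blocks first, then P, then Q, as
+ * raid_gen()/raid_rec() expect - @size would be a multiple of the sector size,
+ * as in ec_generate_ec()), then rebuild a lost data block and P.
+ */
+static void __maybe_unused ec_raid_example(size_t size, void *v[4])
+{
+	int failed[2] = { 0, 2 };	/* data block 0 and the P block */
+
+	raid_gen(2, 2, size, v);	/* compute P and Q from the data */
+
+	memset(v[0], 0, size);		/* simulate losing data block 0... */
+	memset(v[2], 0, size);		/* ...and the P block */
+
+	/* ir[0] < nd, ir[1] == nd: "data + p failure", rebuilt via Q */
+	raid_rec(2, failed, 2, 2, size, v);
+}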
+
+#else
+
+#include <raid/raid.h>
+
+#endif
+
+struct ec_bio {
+ struct bch_dev *ca;
+ struct ec_stripe_buf *buf;
+ size_t idx;
+ struct bio bio;
+};
+
+/* Stripes btree keys: */
+
+int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
+ bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
+ stripe_pos_bad,
+ "stripe at bad pos");
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
+ stripe_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(k.k), stripe_val_u64s(s));
+
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
+}
+
+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+ s->algorithm,
+ le16_to_cpu(s->sectors),
+ nr_data,
+ s->nr_redundant,
+ s->csum_type,
+ 1U << s->csum_granularity_bits);
+
+ for (i = 0; i < s->nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = s->ptrs + i;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
+ if (i < nr_data)
+ prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+ prt_printf(out, " gen %u", ptr->gen);
+ if (ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
+}
+
+/* returns blocknr in stripe that we matched: */
+static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
+ struct bkey_s_c k, unsigned *block)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ for (i = 0; i < nr_data; i++)
+ if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
+ le16_to_cpu(s->sectors))) {
+ *block = i;
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+
+ extent_for_each_entry(e, entry)
+ if (extent_entry_type(entry) ==
+ BCH_EXTENT_ENTRY_stripe_ptr &&
+ entry->stripe_ptr.idx == idx)
+ return true;
+
+ break;
+ }
+ }
+
+ return false;
+}
+
+/* Stripe bufs: */
+
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
+{
+ if (buf->key.k.type == KEY_TYPE_stripe) {
+ struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
+ unsigned i;
+
+ for (i = 0; i < s->v.nr_blocks; i++) {
+ kvpfree(buf->data[i], buf->size << 9);
+ buf->data[i] = NULL;
+ }
+ }
+}
+
+/* XXX: this is a non-mempoolified memory allocation: */
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+ unsigned offset, unsigned size)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1U << v->csum_granularity_bits;
+ unsigned end = offset + size;
+ unsigned i;
+
+ BUG_ON(end > le16_to_cpu(v->sectors));
+
+ offset = round_down(offset, csum_granularity);
+ end = min_t(unsigned, le16_to_cpu(v->sectors),
+ round_up(end, csum_granularity));
+
+ buf->offset = offset;
+ buf->size = end - offset;
+
+ memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+ for (i = 0; i < v->nr_blocks; i++) {
+ buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+ if (!buf->data[i])
+ goto err;
+ }
+
+ return 0;
+err:
+ ec_stripe_buf_exit(buf);
+ return -BCH_ERR_ENOMEM_stripe_buf;
+}
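+
+/*
+ * Example of the rounding above: with csum_granularity_bits == 3 (8 sector
+ * checksum granules) and a 128 sector stripe, a request for offset 5, size 10
+ * is widened to cover sectors [0, 16), so that only whole checksum granules
+ * ever need to be read or checksummed.
+ */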
+
+/* Checksumming: */
+
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+ unsigned block, unsigned offset)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1 << v->csum_granularity_bits;
+ unsigned end = buf->offset + buf->size;
+ unsigned len = min(csum_granularity, end - offset);
+
+ BUG_ON(offset >= end);
+ BUG_ON(offset < buf->offset);
+ BUG_ON(offset & (csum_granularity - 1));
+ BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+ (len & (csum_granularity - 1)));
+
+ return bch2_checksum(NULL, v->csum_type,
+ null_nonce(),
+ buf->data[block] + ((offset - buf->offset) << 9),
+ len << 9);
+}
+
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned i, j, csums_per_device = stripe_csums_per_device(v);
+
+ if (!v->csum_type)
+ return;
+
+ BUG_ON(buf->offset);
+ BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+ for (i = 0; i < v->nr_blocks; i++)
+ for (j = 0; j < csums_per_device; j++)
+ stripe_csum_set(v, i, j,
+ ec_block_checksum(buf, i, j << v->csum_granularity_bits));
+}
+
+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1 << v->csum_granularity_bits;
+ unsigned i;
+
+ if (!v->csum_type)
+ return;
+
+ for (i = 0; i < v->nr_blocks; i++) {
+ unsigned offset = buf->offset;
+ unsigned end = buf->offset + buf->size;
+
+ if (!test_bit(i, buf->valid))
+ continue;
+
+ while (offset < end) {
+ unsigned j = offset >> v->csum_granularity_bits;
+ unsigned len = min(csum_granularity, end - offset);
+ struct bch_csum want = stripe_csum_get(v, i, j);
+ struct bch_csum got = ec_block_checksum(buf, i, offset);
+
+ if (bch2_crc_cmp(want, got)) {
+ struct printbuf err = PRINTBUF;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
+
+ prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
+ want.hi, want.lo,
+ got.hi, got.lo,
+ bch2_csum_types[v->csum_type]);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
+
+ clear_bit(i, buf->valid);
+
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ break;
+ }
+
+ offset += len;
+ }
+ }
+}
+
+/* Erasure coding: */
+
+static void ec_generate_ec(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
+ unsigned bytes = le16_to_cpu(v->sectors) << 9;
+
+ raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
+}
+
+static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+
+ return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
+}
+
+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
+ unsigned bytes = buf->size << 9;
+
+ if (ec_nr_failed(buf) > v->nr_redundant) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: unable to read enough blocks");
+ return -1;
+ }
+
+ for (i = 0; i < nr_data; i++)
+ if (!test_bit(i, buf->valid))
+ failed[nr_failed++] = i;
+
+ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
+ return 0;
+}
+
+/* IO: */
+
+static void ec_block_endio(struct bio *bio)
+{
+ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+ struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
+ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
+ struct bch_dev *ca = ec_bio->ca;
+ struct closure *cl = bio->bi_private;
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "erasure coding %s error: %s",
+ bio_data_dir(bio) ? "write" : "read",
+ bch2_blk_status_to_str(bio->bi_status)))
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(ca->fs,
+ "error %s stripe: stale pointer after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
+
+ bio_put(&ec_bio->bio);
+ percpu_ref_put(&ca->io_ref);
+ closure_put(cl);
+}
+
+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
+ blk_opf_t opf, unsigned idx, struct closure *cl)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned offset = 0, bytes = buf->size << 9;
+ struct bch_extent_ptr *ptr = &v->ptrs[idx];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
+ ? BCH_DATA_user
+ : BCH_DATA_parity;
+ int rw = op_is_write(opf);
+
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(c,
+ "error %s stripe: stale pointer",
+ rw == READ ? "reading from" : "writing to");
+ clear_bit(idx, buf->valid);
+ return;
+ }
+
+ if (!bch2_dev_get_ioref(ca, rw)) {
+ clear_bit(idx, buf->valid);
+ return;
+ }
+
+ this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
+
+ while (offset < bytes) {
+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
+ DIV_ROUND_UP(bytes, PAGE_SIZE));
+ unsigned b = min_t(size_t, bytes - offset,
+ nr_iovecs << PAGE_SHIFT);
+ struct ec_bio *ec_bio;
+
+ ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
+ nr_iovecs,
+ opf,
+ GFP_KERNEL,
+ &c->ec_bioset),
+ struct ec_bio, bio);
+
+ ec_bio->ca = ca;
+ ec_bio->buf = buf;
+ ec_bio->idx = idx;
+
+ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
+ ec_bio->bio.bi_end_io = ec_block_endio;
+ ec_bio->bio.bi_private = cl;
+
+ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
+
+ closure_get(cl);
+ percpu_ref_get(&ca->io_ref);
+
+ submit_bio(&ec_bio->bio);
+
+ offset += b;
+ }
+
+ percpu_ref_put(&ca->io_ref);
+}
+
+static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
+ struct ec_stripe_buf *stripe)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ POS(0, idx), BTREE_ITER_SLOTS);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+ if (k.k->type != KEY_TYPE_stripe) {
+ ret = -ENOENT;
+ goto err;
+ }
+ bkey_reassemble(&stripe->key, k);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* recovery read path: */
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_buf *buf;
+ struct closure cl;
+ struct bch_stripe *v;
+ unsigned i, offset;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ BUG_ON(!rbio->pick.has_ec);
+
+ buf = kzalloc(sizeof(*buf), GFP_NOFS);
+ if (!buf)
+ return -BCH_ERR_ENOMEM_ec_read_extent;
+
+ ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
+ if (ret) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: error %i looking up stripe", ret);
+ kfree(buf);
+ return -EIO;
+ }
+
+ v = &bkey_i_to_stripe(&buf->key)->v;
+
+ if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: pointer doesn't match stripe");
+ ret = -EIO;
+ goto err;
+ }
+
+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: read is bigger than stripe");
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+ if (ret)
+ goto err;
+
+ for (i = 0; i < v->nr_blocks; i++)
+ ec_block_io(c, buf, REQ_OP_READ, i, &cl);
+
+ closure_sync(&cl);
+
+ if (ec_nr_failed(buf) > v->nr_redundant) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: unable to read enough blocks");
+ ret = -EIO;
+ goto err;
+ }
+
+ ec_validate_checksums(c, buf);
+
+ ret = ec_do_recov(c, buf);
+ if (ret)
+ goto err;
+
+ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
+ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
+err:
+ ec_stripe_buf_exit(buf);
+ kfree(buf);
+ return ret;
+}
+
+/* stripe bucket accounting: */
+
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+{
+ ec_stripes_heap n, *h = &c->ec_stripes_heap;
+
+ if (idx >= h->size) {
+ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ if (n.size > h->size) {
+ memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
+ n.used = h->used;
+ swap(*h, n);
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ free_heap(&n);
+ }
+
+ if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
+ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ return 0;
+}
+
+static int ec_stripe_mem_alloc(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ return allocate_dropping_locks_errcode(trans,
+ __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
+}
+
+/*
+ * Hash table of open stripes:
+ * Stripes that are being created or modified are kept in a hash table, so that
+ * stripe deletion can skip them.
+ */
+
+static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+ struct ec_stripe_new *s;
+
+ hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
+ if (s->idx == idx)
+ return true;
+ return false;
+}
+
+static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+ bool ret = false;
+
+ spin_lock(&c->ec_stripes_new_lock);
+ ret = __bch2_stripe_is_open(c, idx);
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ return ret;
+}
+
+static bool bch2_try_open_stripe(struct bch_fs *c,
+ struct ec_stripe_new *s,
+ u64 idx)
+{
+ bool ret;
+
+ spin_lock(&c->ec_stripes_new_lock);
+ ret = !__bch2_stripe_is_open(c, idx);
+ if (ret) {
+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+
+ s->idx = idx;
+ hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
+ }
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ return ret;
+}
+
+static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ BUG_ON(!s->idx);
+
+ spin_lock(&c->ec_stripes_new_lock);
+ hlist_del_init(&s->hash);
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ s->idx = 0;
+}
+
+/* Heap of all existing stripes, ordered by blocks_nonempty */
+
+static u64 stripe_idx_to_delete(struct bch_fs *c)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+
+ lockdep_assert_held(&c->ec_stripes_heap_lock);
+
+ if (h->used &&
+ h->data[0].blocks_nonempty == 0 &&
+ !bch2_stripe_is_open(c, h->data[0].idx))
+ return h->data[0].idx;
+
+ return 0;
+}
+
+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
+ struct ec_stripe_heap_entry l,
+ struct ec_stripe_heap_entry r)
+{
+ return ((l.blocks_nonempty > r.blocks_nonempty) -
+ (l.blocks_nonempty < r.blocks_nonempty));
+}
+
+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
+ size_t i)
+{
+ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
+
+ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
+}
+
+static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ BUG_ON(m->heap_idx >= h->used);
+ BUG_ON(h->data[m->heap_idx].idx != idx);
+}
+
+void bch2_stripes_heap_del(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ mutex_lock(&c->ec_stripes_heap_lock);
+ heap_verify_backpointer(c, idx);
+
+ heap_del(&c->ec_stripes_heap, m->heap_idx,
+ ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_stripes_heap_insert(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ mutex_lock(&c->ec_stripes_heap_lock);
+ BUG_ON(heap_full(&c->ec_stripes_heap));
+
+ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
+ .idx = idx,
+ .blocks_nonempty = m->blocks_nonempty,
+ }),
+ ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+
+ heap_verify_backpointer(c, idx);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_stripes_heap_update(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ bool do_deletes;
+ size_t i;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ heap_verify_backpointer(c, idx);
+
+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
+
+ i = m->heap_idx;
+ heap_sift_up(h, i, ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+ heap_sift_down(h, i, ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+
+ heap_verify_backpointer(c, idx);
+
+ do_deletes = stripe_idx_to_delete(c) != 0;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ if (do_deletes)
+ bch2_do_stripe_deletes(c);
+}
+
+/* stripe deletion */
+
+static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_stripe s;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
+ BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_stripe) {
+ bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ s = bkey_s_c_to_stripe(k);
+ for (unsigned i = 0; i < s.v->nr_blocks; i++)
+ if (stripe_blockcount_get(s.v, i)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void ec_stripe_delete_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, ec_stripe_delete_work);
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret;
+ u64 idx;
+
+ while (1) {
+ mutex_lock(&c->ec_stripes_heap_lock);
+ idx = stripe_idx_to_delete(c);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ if (!idx)
+ break;
+
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ec_stripe_delete(trans, idx));
+ if (ret) {
+ bch_err_fn(c, ret);
+ break;
+ }
+ }
+
+ bch2_trans_put(trans);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+void bch2_do_stripe_deletes(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
+ !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+/* stripe creation: */
+
+static int ec_stripe_key_update(struct btree_trans *trans,
+ struct bkey_i_stripe *new,
+ bool create)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
+ bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
+ create ? "creating" : "updating",
+ bch2_bkey_types[k.k->type]);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (k.k->type == KEY_TYPE_stripe) {
+ const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
+ unsigned i;
+
+ if (old->nr_blocks != new->v.nr_blocks) {
+ bch_err(c, "error updating stripe: nr_blocks does not match");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ for (i = 0; i < new->v.nr_blocks; i++) {
+ unsigned v = stripe_blockcount_get(old, i);
+
+ BUG_ON(v &&
+ (old->ptrs[i].dev != new->v.ptrs[i].dev ||
+ old->ptrs[i].gen != new->v.ptrs[i].gen ||
+ old->ptrs[i].offset != new->v.ptrs[i].offset));
+
+ stripe_blockcount_set(&new->v, i, v);
+ }
+ }
+
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int ec_stripe_update_extent(struct btree_trans *trans,
+ struct bpos bucket, u8 gen,
+ struct ec_stripe_buf *s,
+ struct bpos *bp_pos)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ struct bch_fs *c = trans->c;
+ struct bch_backpointer bp;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_extent_ptr *ptr_c;
+ struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+ struct bch_extent_stripe_ptr stripe_ptr;
+ struct bkey_i *n;
+ int ret, dev, block;
+
+ ret = bch2_get_next_backpointer(trans, bucket, gen,
+ bp_pos, &bp, BTREE_ITER_CACHED);
+ if (ret)
+ return ret;
+ if (bpos_eq(*bp_pos, SPOS_MAX))
+ return 0;
+
+ if (bp.level) {
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter node_iter;
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
+ bch2_trans_iter_exit(trans, &node_iter);
+
+ if (!b)
+ return 0;
+
+ prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
+ bch2_backpointer_to_text(&buf, &bp);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -EIO;
+ }
+
+ k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k) {
+ /*
+ * extent no longer exists - we could flush the btree
+ * write buffer and retry to verify, but no need:
+ */
+ return 0;
+ }
+
+ if (extent_has_stripe_ptr(k, s->key.k.p.offset))
+ goto out;
+
+ ptr_c = bkey_matches_stripe(v, k, &block);
+ /*
+ * It doesn't generally make sense to erasure code cached ptrs:
+ * XXX: should we be incrementing a counter?
+ */
+ if (!ptr_c || ptr_c->cached)
+ goto out;
+
+ dev = v->ptrs[block].dev;
+
+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto out;
+
+ bkey_reassemble(n, k);
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
+ BUG_ON(!ec_ptr);
+
+ stripe_ptr = (struct bch_extent_stripe_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+ .block = block,
+ .redundancy = v->nr_redundant,
+ .idx = s->key.k.p.offset,
+ };
+
+ __extent_entry_insert(n,
+ (union bch_extent_entry *) ec_ptr,
+ (union bch_extent_entry *) &stripe_ptr);
+
+ ret = bch2_trans_update(trans, &iter, n, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
+ unsigned block)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ struct bch_extent_ptr bucket = v->ptrs[block];
+ struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
+ struct bpos bp_pos = POS_MIN;
+ int ret = 0;
+
+ while (1) {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL,
+ ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
+ s, &bp_pos));
+ if (ret)
+ break;
+ if (bkey_eq(bp_pos, POS_MAX))
+ break;
+
+ bp_pos = bpos_nosnap_successor(bp_pos);
+ }
+
+ return ret;
+}
+
+static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+ int ret = 0;
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_data; i++) {
+ ret = ec_stripe_update_bucket(trans, s, i);
+ if (ret)
+ break;
+ }
+err:
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
+ struct ec_stripe_new *s,
+ unsigned block,
+ struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
+ int ret;
+
+ if (!bch2_dev_get_ioref(ca, WRITE)) {
+ s->err = -BCH_ERR_erofs_no_writes;
+ return;
+ }
+
+ memset(s->new_stripe.data[block] + (offset << 9),
+ 0,
+ ob->sectors_free << 9);
+
+ ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+ ob->bucket * ca->mi.bucket_size + offset,
+ ob->sectors_free,
+ GFP_KERNEL, 0);
+
+ percpu_ref_put(&ca->io_ref);
+
+ if (ret)
+ s->err = ret;
+}
+
+void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ if (s->idx)
+ bch2_stripe_close(c, s);
+ kfree(s);
+}
+
+/*
+ * data buckets of new stripe all written: create the stripe
+ */
+static void ec_stripe_create(struct ec_stripe_new *s)
+{
+ struct bch_fs *c = s->c;
+ struct open_bucket *ob;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+ int ret;
+
+ BUG_ON(s->h->s == s);
+
+ closure_sync(&s->iodone);
+
+ if (!s->err) {
+ for (i = 0; i < nr_data; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (ob->sectors_free)
+ zero_out_rest_of_ec_bucket(c, s, i, ob);
+ }
+ }
+
+ if (s->err) {
+ if (!bch2_err_matches(s->err, EROFS))
+ bch_err(c, "error creating stripe: error writing data buckets");
+ goto err;
+ }
+
+ if (s->have_existing_stripe) {
+ ec_validate_checksums(c, &s->existing_stripe);
+
+ if (ec_do_recov(c, &s->existing_stripe)) {
+ bch_err(c, "error creating stripe: error reading existing stripe");
+ goto err;
+ }
+
+ for (i = 0; i < nr_data; i++)
+ if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
+ swap(s->new_stripe.data[i],
+ s->existing_stripe.data[i]);
+
+ ec_stripe_buf_exit(&s->existing_stripe);
+ }
+
+ BUG_ON(!s->allocated);
+ BUG_ON(!s->idx);
+
+ ec_generate_ec(&s->new_stripe);
+
+ ec_generate_checksums(&s->new_stripe);
+
+ /* write p/q: */
+ for (i = nr_data; i < v->nr_blocks; i++)
+ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
+ closure_sync(&s->iodone);
+
+ if (ec_nr_failed(&s->new_stripe)) {
+ bch_err(c, "error creating stripe: error writing redundancy buckets");
+ goto err;
+ }
+
+ ret = bch2_trans_do(c, &s->res, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL,
+ ec_stripe_key_update(trans,
+ bkey_i_to_stripe(&s->new_stripe.key),
+ !s->have_existing_stripe));
+ if (ret) {
+ bch_err(c, "error creating stripe: error creating stripe key");
+ goto err;
+ }
+
+ ret = ec_stripe_update_extents(c, &s->new_stripe);
+ if (ret) {
+ bch_err_msg(c, ret, "creating stripe: error updating pointers");
+ goto err;
+ }
+err:
+ bch2_disk_reservation_put(c, &s->res);
+
+ for (i = 0; i < v->nr_blocks; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (i < nr_data) {
+ ob->ec = NULL;
+ __bch2_open_bucket_put(c, ob);
+ } else {
+ bch2_open_bucket_put(c, ob);
+ }
+ }
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_del(&s->list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+ wake_up(&c->ec_stripe_new_wait);
+
+ ec_stripe_buf_exit(&s->existing_stripe);
+ ec_stripe_buf_exit(&s->new_stripe);
+ closure_debug_destroy(&s->iodone);
+
+ ec_stripe_new_put(c, s, STRIPE_REF_stripe);
+}
+
+static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
+{
+ struct ec_stripe_new *s;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry(s, &c->ec_stripe_new_list, list)
+ if (!atomic_read(&s->ref[STRIPE_REF_io]))
+ goto out;
+ s = NULL;
+out:
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return s;
+}
+
+static void ec_stripe_create_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work,
+ struct bch_fs, ec_stripe_create_work);
+ struct ec_stripe_new *s;
+
+ while ((s = get_pending_stripe(c)))
+ ec_stripe_create(s);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+}
+
+void bch2_ec_do_stripe_creates(struct bch_fs *c)
+{
+ bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
+
+ if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+}
+
+static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ struct ec_stripe_new *s = h->s;
+
+ BUG_ON(!s->allocated && !s->err);
+
+ h->s = NULL;
+ s->pending = true;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_add(&s->list, &c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ ec_stripe_new_put(c, s, STRIPE_REF_io);
+}
+
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct ec_stripe_new *s = ob->ec;
+
+ s->err = -EIO;
+}
+
+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
+{
+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+ struct bch_dev *ca;
+ unsigned offset;
+
+ if (!ob)
+ return NULL;
+
+ BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
+
+ ca = bch_dev_bkey_exists(c, ob->dev);
+ offset = ca->mi.bucket_size - ob->sectors_free;
+
+ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
+}
+
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+ unsigned l = *((const unsigned *) _l);
+ unsigned r = *((const unsigned *) _r);
+
+ return cmp_int(l, r);
+}
+
+/* pick most common bucket size: */
+static unsigned pick_blocksize(struct bch_fs *c,
+ struct bch_devs_mask *devs)
+{
+ struct bch_dev *ca;
+ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+ struct {
+ unsigned nr, size;
+ } cur = { 0, 0 }, best = { 0, 0 };
+
+ for_each_member_device_rcu(ca, c, i, devs)
+ sizes[nr++] = ca->mi.bucket_size;
+
+ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
+
+ for (i = 0; i < nr; i++) {
+ if (sizes[i] != cur.size) {
+ if (cur.nr > best.nr)
+ best = cur;
+
+ cur.nr = 0;
+ cur.size = sizes[i];
+ }
+
+ cur.nr++;
+ }
+
+ if (cur.nr > best.nr)
+ best = cur;
+
+ return best.size;
+}
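+
+/*
+ * e.g. with member bucket sizes { 512, 1024, 1024, 2048 }, pick_blocksize()
+ * returns 1024, the size shared by the most devices.
+ */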
+
+static bool may_create_new_stripe(struct bch_fs *c)
+{
+ return false;
+}
+
+static void ec_stripe_key_init(struct bch_fs *c,
+ struct bkey_i *k,
+ unsigned nr_data,
+ unsigned nr_parity,
+ unsigned stripe_size)
+{
+ struct bkey_i_stripe *s = bkey_stripe_init(k);
+ unsigned u64s;
+
+ s->v.sectors = cpu_to_le16(stripe_size);
+ s->v.algorithm = 0;
+ s->v.nr_blocks = nr_data + nr_parity;
+ s->v.nr_redundant = nr_parity;
+ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
+ s->v.csum_type = BCH_CSUM_crc32c;
+ s->v.pad = 0;
+
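+	/*
+	 * If the encoded stripe value would be too big for a bkey, coarsen the
+	 * checksum granularity - each increment roughly halves the number of
+	 * per-device checksums - until it fits:
+	 */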
+ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
+ BUG_ON(1 << s->v.csum_granularity_bits >=
+ le16_to_cpu(s->v.sectors) ||
+ s->v.csum_granularity_bits == U8_MAX);
+ s->v.csum_granularity_bits++;
+ }
+
+ set_bkey_val_u64s(&s->k, u64s);
+}
+
+static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ struct ec_stripe_new *s;
+
+ lockdep_assert_held(&h->lock);
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+
+ mutex_init(&s->lock);
+ closure_init(&s->iodone, NULL);
+ atomic_set(&s->ref[STRIPE_REF_stripe], 1);
+ atomic_set(&s->ref[STRIPE_REF_io], 1);
+ s->c = c;
+ s->h = h;
+ s->nr_data = min_t(unsigned, h->nr_active_devs,
+ BCH_BKEY_PTRS_MAX) - h->redundancy;
+ s->nr_parity = h->redundancy;
+
+ ec_stripe_key_init(c, &s->new_stripe.key,
+ s->nr_data, s->nr_parity, h->blocksize);
+
+ h->s = s;
+ return 0;
+}
+
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
+ unsigned algo, unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct ec_stripe_head *h;
+ struct bch_dev *ca;
+ unsigned i;
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return NULL;
+
+ mutex_init(&h->lock);
+ BUG_ON(!mutex_trylock(&h->lock));
+
+ h->target = target;
+ h->algo = algo;
+ h->redundancy = redundancy;
+ h->watermark = watermark;
+
+ rcu_read_lock();
+ h->devs = target_rw_devs(c, BCH_DATA_user, target);
+
+ for_each_member_device_rcu(ca, c, i, &h->devs)
+ if (!ca->mi.durability)
+ __clear_bit(i, h->devs.d);
+
+ h->blocksize = pick_blocksize(c, &h->devs);
+
+ for_each_member_device_rcu(ca, c, i, &h->devs)
+ if (ca->mi.bucket_size == h->blocksize)
+ h->nr_active_devs++;
+
+ rcu_read_unlock();
+
+ /*
+ * If we only have redundancy + 1 devices, we're better off with just
+ * replication:
+ */
+ if (h->nr_active_devs < h->redundancy + 2)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+ h->nr_active_devs, h->redundancy + 2);
+
+ list_add(&h->list, &c->ec_stripe_head_list);
+ return h;
+}
+
+void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ if (h->s &&
+ h->s->allocated &&
+ bitmap_weight(h->s->blocks_allocated,
+ h->s->nr_data) == h->s->nr_data)
+ ec_stripe_set_pending(c, h);
+
+ mutex_unlock(&h->lock);
+}
+
+static struct ec_stripe_head *
+__bch2_ec_stripe_head_get(struct btree_trans *trans,
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_head *h;
+ int ret;
+
+ if (!redundancy)
+ return NULL;
+
+ ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+ h = ERR_PTR(-BCH_ERR_erofs_no_writes);
+ goto found;
+ }
+
+ list_for_each_entry(h, &c->ec_stripe_head_list, list)
+ if (h->target == target &&
+ h->algo == algo &&
+ h->redundancy == redundancy &&
+ h->watermark == watermark) {
+ ret = bch2_trans_mutex_lock(trans, &h->lock);
+ if (ret)
+ h = ERR_PTR(ret);
+ goto found;
+ }
+
+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
+found:
+ if (!IS_ERR_OR_NULL(h) &&
+ h->nr_active_devs < h->redundancy + 2) {
+ mutex_unlock(&h->lock);
+ h = NULL;
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+ return h;
+}
+
+static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
+ enum bch_watermark watermark, struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_devs_mask devs = h->devs;
+ struct open_bucket *ob;
+ struct open_buckets buckets;
+ struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+ unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
+ bool have_cache = true;
+ int ret = 0;
+
+ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
+ BUG_ON(v->nr_redundant != h->s->nr_parity);
+
+ for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
+ __clear_bit(v->ptrs[i].dev, devs.d);
+ if (i < h->s->nr_data)
+ nr_have_data++;
+ else
+ nr_have_parity++;
+ }
+
+ BUG_ON(nr_have_data > h->s->nr_data);
+ BUG_ON(nr_have_parity > h->s->nr_parity);
+
+ buckets.nr = 0;
+ if (nr_have_parity < h->s->nr_parity) {
+ ret = bch2_bucket_alloc_set_trans(trans, &buckets,
+ &h->parity_stripe,
+ &devs,
+ h->s->nr_parity,
+ &nr_have_parity,
+ &have_cache, 0,
+ BCH_DATA_parity,
+ watermark,
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data + h->s->nr_parity,
+ h->s->nr_data);
+ BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+
+ h->s->blocks[j] = buckets.v[i];
+ v->ptrs[j] = bch2_ob_ptr(c, ob);
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
+ if (ret)
+ return ret;
+ }
+
+ buckets.nr = 0;
+ if (nr_have_data < h->s->nr_data) {
+ ret = bch2_bucket_alloc_set_trans(trans, &buckets,
+ &h->block_stripe,
+ &devs,
+ h->s->nr_data,
+ &nr_have_data,
+ &have_cache, 0,
+ BCH_DATA_user,
+ watermark,
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data, 0);
+ BUG_ON(j >= h->s->nr_data);
+
+ h->s->blocks[j] = buckets.v[i];
+ v->ptrs[j] = bch2_ob_ptr(c, ob);
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* XXX: doesn't obey target: */
+static s64 get_existing_stripe(struct bch_fs *c,
+ struct ec_stripe_head *head)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m;
+ size_t heap_idx;
+ u64 stripe_idx;
+ s64 ret = -1;
+
+ if (may_create_new_stripe(c))
+ return -1;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+ /* No blocks worth reusing, stripe will just be deleted: */
+ if (!h->data[heap_idx].blocks_nonempty)
+ continue;
+
+ stripe_idx = h->data[heap_idx].idx;
+
+ m = genradix_ptr(&c->stripes, stripe_idx);
+
+ if (m->algorithm == head->algo &&
+ m->nr_redundant == head->redundancy &&
+ m->sectors == head->blocksize &&
+ m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
+ bch2_try_open_stripe(c, head->s, stripe_idx)) {
+ ret = stripe_idx;
+ break;
+ }
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ return ret;
+}
+
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+ struct bch_stripe *existing_v;
+ unsigned i;
+ s64 idx;
+ int ret;
+
+ /*
+ * If we can't allocate a new stripe, and there's no stripes with empty
+ * blocks for us to reuse, that means we have to wait on copygc:
+ */
+ idx = get_existing_stripe(c, h);
+ if (idx < 0)
+ return -BCH_ERR_stripe_alloc_blocked;
+
+ ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+ if (ret) {
+ bch2_stripe_close(c, h->s);
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
+ return ret;
+ }
+
+ existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
+
+ BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
+ h->s->nr_data = existing_v->nr_blocks -
+ existing_v->nr_redundant;
+
+ ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
+ if (ret) {
+ bch2_stripe_close(c, h->s);
+ return ret;
+ }
+
+ BUG_ON(h->s->existing_stripe.size != h->blocksize);
+ BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
+
+ /*
+ * Free buckets we initially allocated - they might conflict with
+ * blocks from the stripe we're reusing:
+ */
+ for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
+ bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
+ h->s->blocks[i] = 0;
+ }
+ memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
+ memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
+
+ for (i = 0; i < existing_v->nr_blocks; i++) {
+ if (stripe_blockcount_get(existing_v, i)) {
+ __set_bit(i, h->s->blocks_gotten);
+ __set_bit(i, h->s->blocks_allocated);
+ }
+
+ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+ }
+
+ bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
+ h->s->have_existing_stripe = true;
+
+ return 0;
+}
+
+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos min_pos = POS(0, 1);
+ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
+ int ret;
+
+ if (!h->s->res.sectors) {
+ ret = bch2_disk_reservation_get(c, &h->s->res,
+ h->blocksize,
+ h->s->nr_parity,
+ BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ return ret;
+ }
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
+ if (start_pos.offset) {
+ start_pos = min_pos;
+ bch2_btree_iter_set_pos(&iter, start_pos);
+ continue;
+ }
+
+ ret = -BCH_ERR_ENOSPC_stripe_create;
+ break;
+ }
+
+ if (bkey_deleted(k.k) &&
+ bch2_try_open_stripe(c, h->s, k.k->p.offset))
+ break;
+ }
+
+ c->ec_stripe_hint = iter.pos.offset;
+
+ if (ret)
+ goto err;
+
+ ret = ec_stripe_mem_alloc(trans, &iter);
+ if (ret) {
+ bch2_stripe_close(c, h->s);
+ goto err;
+ }
+
+ h->s->new_stripe.key.k.p = iter.pos;
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+err:
+ bch2_disk_reservation_put(c, &h->s->res);
+ goto out;
+}
+
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy,
+ enum bch_watermark watermark,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_head *h;
+ bool waiting = false;
+ int ret;
+
+ h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
+ if (IS_ERR_OR_NULL(h))
+ return h;
+
+ if (!h->s) {
+ ret = ec_new_stripe_alloc(c, h);
+ if (ret) {
+ bch_err(c, "failed to allocate new stripe");
+ goto err;
+ }
+ }
+
+ if (h->s->allocated)
+ goto allocated;
+
+ if (h->s->have_existing_stripe)
+ goto alloc_existing;
+
+ /* First, try to allocate a full stripe: */
+ ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h);
+ if (!ret)
+ goto allocate_buf;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, ENOMEM))
+ goto err;
+
+ /*
+ * Not enough buckets available for a full stripe: we must reuse an
+ * existing stripe:
+ */
+ while (1) {
+ ret = __bch2_ec_stripe_head_reuse(trans, h);
+ if (!ret)
+ break;
+ if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
+ goto err;
+
+ if (watermark == BCH_WATERMARK_copygc) {
+ ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h);
+ if (ret)
+ goto err;
+ goto allocate_buf;
+ }
+
+ /* XXX freelist_wait? */
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ }
+
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+alloc_existing:
+ /*
+ * Retry allocating buckets, with the watermark for this
+ * particular write:
+ */
+ ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
+ if (ret)
+ goto err;
+
+allocate_buf:
+ ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
+ if (ret)
+ goto err;
+
+ h->s->allocated = true;
+allocated:
+ BUG_ON(!h->s->idx);
+ BUG_ON(!h->s->new_stripe.data[0]);
+ BUG_ON(trans->restarted);
+ return h;
+err:
+ bch2_ec_stripe_head_put(c, h);
+ return ERR_PTR(ret);
+}
+
+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct ec_stripe_head *h;
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+ mutex_lock(&h->lock);
+ if (!h->s)
+ goto unlock;
+
+ if (!ca)
+ goto found;
+
+ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
+ if (!h->s->blocks[i])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[i];
+ if (ob->dev == ca->dev_idx)
+ goto found;
+ }
+ goto unlock;
+found:
+ h->s->err = -BCH_ERR_erofs_no_writes;
+ ec_stripe_set_pending(c, h);
+unlock:
+ mutex_unlock(&h->lock);
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+}
+
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+ __bch2_ec_stop(c, ca);
+}
+
+void bch2_fs_ec_stop(struct bch_fs *c)
+{
+ __bch2_ec_stop(c, NULL);
+}
+
+static bool bch2_fs_ec_flush_done(struct bch_fs *c)
+{
+ bool ret;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ ret = list_empty(&c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return ret;
+}
+
+void bch2_fs_ec_flush(struct bch_fs *c)
+{
+ wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
+}
+
+int bch2_stripes_read(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_stripe *s;
+ struct stripe *m;
+ unsigned i;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
+
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
+
+ s = bkey_s_c_to_stripe(k).v;
+
+ m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+
+ bch2_stripes_heap_insert(c, m, k.k->p.offset);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+
+ return ret;
+}
+
+void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m;
+ size_t i;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ for (i = 0; i < min_t(size_t, h->used, 50); i++) {
+ m = genradix_ptr(&c->stripes, h->data[i].idx);
+
+ prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
+ h->data[i].blocks_nonempty,
+ m->nr_blocks - m->nr_redundant,
+ m->nr_redundant);
+ if (bch2_stripe_is_open(c, h->data[i].idx))
+ prt_str(out, " open");
+ prt_newline(out);
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct ec_stripe_head *h;
+ struct ec_stripe_new *s;
+
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+ prt_printf(out, "target %u algo %u redundancy %u %s:\n",
+ h->target, h->algo, h->redundancy,
+ bch2_watermarks[h->watermark]);
+
+ if (h->s)
+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
+ h->s->idx, h->s->nr_data, h->s->nr_parity,
+ bitmap_weight(h->s->blocks_allocated,
+ h->s->nr_data));
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+
+ prt_printf(out, "in flight:\n");
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry(s, &c->ec_stripe_new_list, list) {
+ prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
+ s->idx, s->nr_data, s->nr_parity,
+ atomic_read(&s->ref[STRIPE_REF_io]),
+ atomic_read(&s->ref[STRIPE_REF_stripe]),
+ bch2_watermarks[s->h->watermark]);
+ }
+ mutex_unlock(&c->ec_stripe_new_lock);
+}
+
+void bch2_fs_ec_exit(struct bch_fs *c)
+{
+ struct ec_stripe_head *h;
+ unsigned i;
+
+ while (1) {
+ mutex_lock(&c->ec_stripe_head_lock);
+ h = list_first_entry_or_null(&c->ec_stripe_head_list,
+ struct ec_stripe_head, list);
+ if (h)
+ list_del(&h->list);
+ mutex_unlock(&c->ec_stripe_head_lock);
+ if (!h)
+ break;
+
+ if (h->s) {
+ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
+ BUG_ON(h->s->blocks[i]);
+
+ kfree(h->s);
+ }
+ kfree(h);
+ }
+
+ BUG_ON(!list_empty(&c->ec_stripe_new_list));
+
+ free_heap(&c->ec_stripes_heap);
+ genradix_free(&c->stripes);
+ bioset_exit(&c->ec_bioset);
+}
+
+void bch2_fs_ec_init_early(struct bch_fs *c)
+{
+ spin_lock_init(&c->ec_stripes_new_lock);
+ mutex_init(&c->ec_stripes_heap_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_head_list);
+ mutex_init(&c->ec_stripe_head_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_new_list);
+ mutex_init(&c->ec_stripe_new_lock);
+ init_waitqueue_head(&c->ec_stripe_new_wait);
+
+ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
+ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+}
+
+int bch2_fs_ec_init(struct bch_fs *c)
+{
+ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
+ BIOSET_NEED_BVECS);
+}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
new file mode 100644
index 000000000000..7d0237c9819f
--- /dev/null
+++ b/fs/bcachefs/ec.h
@@ -0,0 +1,260 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_H
+#define _BCACHEFS_EC_H
+
+#include "ec_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+
+enum bkey_invalid_flags;
+
+int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
+ .key_invalid = bch2_stripe_invalid, \
+ .val_to_text = bch2_stripe_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_stripe, \
+ .atomic_trigger = bch2_mark_stripe, \
+ .min_val_size = 8, \
+})
+
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+ return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+ 1 << s->csum_granularity_bits);
+}
+
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+ unsigned dev, unsigned csum_idx)
+{
+ unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+
+ return sizeof(struct bch_stripe) +
+ sizeof(struct bch_extent_ptr) * s->nr_blocks +
+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+ unsigned idx)
+{
+ return stripe_csum_offset(s, s->nr_blocks, 0) +
+ sizeof(u16) * idx;
+}
+
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+ unsigned idx)
+{
+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
+}
+
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+ unsigned idx, unsigned v)
+{
+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
+
+ *p = cpu_to_le16(v);
+}
+
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
+ sizeof(u64));
+}
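+
+/*
+ * Rough sketch of the variable length stripe value layout the helpers above
+ * index into (assuming crc32c, i.e. 4 byte checksums, as set by
+ * ec_stripe_key_init()):
+ *
+ *	struct bch_stripe			fixed header
+ *	struct bch_extent_ptr ptrs[nr_blocks]	one bucket per block
+ *	csums[nr_blocks][csums_per_device]	4 bytes each
+ *	__le16 block_sectors[nr_blocks]		per block sector counts
+ *
+ * where csums_per_device = DIV_ROUND_UP(sectors, 1 << csum_granularity_bits).
+ */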
+
+static inline void *stripe_csum(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx)
+{
+ EBUG_ON(block >= s->nr_blocks);
+ EBUG_ON(csum_idx >= stripe_csums_per_device(s));
+
+ return (void *) s + stripe_csum_offset(s, block, csum_idx);
+}
+
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx)
+{
+ struct bch_csum csum = { 0 };
+
+ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
+ return csum;
+}
+
+static inline void stripe_csum_set(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx,
+ struct bch_csum csum)
+{
+ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
+}
+
+static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
+ const struct bch_extent_ptr *data_ptr,
+ unsigned sectors)
+{
+ return data_ptr->dev == stripe_ptr->dev &&
+ data_ptr->gen == stripe_ptr->gen &&
+ data_ptr->offset >= stripe_ptr->offset &&
+ data_ptr->offset < stripe_ptr->offset + sectors;
+}
+
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
+ struct extent_ptr_decoded p)
+{
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+
+ BUG_ON(!p.has_ec);
+
+ if (p.ec.block >= nr_data)
+ return false;
+
+ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
+ le16_to_cpu(s->sectors));
+}
+
+static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
+ struct extent_ptr_decoded p)
+{
+ unsigned nr_data = m->nr_blocks - m->nr_redundant;
+
+ BUG_ON(!p.has_ec);
+
+ if (p.ec.block >= nr_data)
+ return false;
+
+ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
+ m->sectors);
+}
+
+struct bch_read_bio;
+
+struct ec_stripe_buf {
+ /* might not be buffering the entire stripe: */
+ unsigned offset;
+ unsigned size;
+ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+
+ void *data[BCH_BKEY_PTRS_MAX];
+
+ __BKEY_PADDED(key, 255);
+};
+
+struct ec_stripe_head;
+
+enum ec_stripe_ref {
+ STRIPE_REF_io,
+ STRIPE_REF_stripe,
+ STRIPE_REF_NR
+};
+
+struct ec_stripe_new {
+ struct bch_fs *c;
+ struct ec_stripe_head *h;
+ struct mutex lock;
+ struct list_head list;
+
+ struct hlist_node hash;
+ u64 idx;
+
+ struct closure iodone;
+
+ atomic_t ref[STRIPE_REF_NR];
+
+ int err;
+
+ u8 nr_data;
+ u8 nr_parity;
+ bool allocated;
+ bool pending;
+ bool have_existing_stripe;
+
+ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
+ struct disk_reservation res;
+
+ struct ec_stripe_buf new_stripe;
+ struct ec_stripe_buf existing_stripe;
+};
+
+struct ec_stripe_head {
+ struct list_head list;
+ struct mutex lock;
+
+ unsigned target;
+ unsigned algo;
+ unsigned redundancy;
+ enum bch_watermark watermark;
+
+ struct bch_devs_mask devs;
+ unsigned nr_active_devs;
+
+ unsigned blocksize;
+
+ struct dev_stripe_state block_stripe;
+ struct dev_stripe_state parity_stripe;
+
+ struct ec_stripe_new *s;
+};
+
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
+
+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
+
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+
+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
+
+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
+ unsigned, unsigned, unsigned,
+ enum bch_watermark, struct closure *);
+
+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
+
+void bch2_do_stripe_deletes(struct bch_fs *);
+void bch2_ec_do_stripe_creates(struct bch_fs *);
+void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
+
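+/*
+ * Two reference counts guard an ec_stripe_new (see the helpers below): when
+ * the last STRIPE_REF_io ref is dropped, the pending stripe is queued for
+ * creation via bch2_ec_do_stripe_creates(); when the last STRIPE_REF_stripe
+ * ref is dropped, the ec_stripe_new itself is freed.
+ */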
+static inline void ec_stripe_new_get(struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
+{
+ atomic_inc(&s->ref[ref]);
+}
+
+static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
+{
+ BUG_ON(atomic_read(&s->ref[ref]) <= 0);
+
+ if (atomic_dec_and_test(&s->ref[ref]))
+ switch (ref) {
+ case STRIPE_REF_stripe:
+ bch2_ec_stripe_new_free(c, s);
+ break;
+ case STRIPE_REF_io:
+ bch2_ec_do_stripe_creates(c);
+ break;
+ default:
+ BUG();
+ }
+}
+
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+void bch2_fs_ec_stop(struct bch_fs *);
+void bch2_fs_ec_flush(struct bch_fs *);
+
+int bch2_stripes_read(struct bch_fs *);
+
+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_ec_exit(struct bch_fs *);
+void bch2_fs_ec_init_early(struct bch_fs *);
+int bch2_fs_ec_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
new file mode 100644
index 000000000000..e2b02a82de32
--- /dev/null
+++ b/fs/bcachefs/ec_types.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_TYPES_H
+#define _BCACHEFS_EC_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_replicas_padded {
+ struct bch_replicas_entry e;
+ u8 pad[BCH_BKEY_PTRS_MAX];
+};
+
+struct stripe {
+ size_t heap_idx;
+ u16 sectors;
+ u8 algorithm;
+ u8 nr_blocks;
+ u8 nr_redundant;
+ u8 blocks_nonempty;
+};
+
+struct gc_stripe {
+ u16 sectors;
+
+ u8 nr_blocks;
+ u8 nr_redundant;
+
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
+ u16 block_sectors[BCH_BKEY_PTRS_MAX];
+ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
+
+ struct bch_replicas_padded r;
+};
+
+struct ec_stripe_heap_entry {
+ size_t idx;
+ unsigned blocks_nonempty;
+};
+
+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
+
+#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
new file mode 100644
index 000000000000..d260ff9bbfeb
--- /dev/null
+++ b/fs/bcachefs/errcode.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "errcode.h"
+
+#include <linux/errname.h>
+
+static const char * const bch2_errcode_strs[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
+ BCH_ERRCODES()
+#undef x
+ NULL
+};
+
+static unsigned bch2_errcode_parents[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
+ BCH_ERRCODES()
+#undef x
+};
+
+const char *bch2_err_str(int err)
+{
+ const char *errstr;
+
+ err = abs(err);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+
+ if (err >= BCH_ERR_START)
+ errstr = bch2_errcode_strs[err - BCH_ERR_START];
+ else if (err)
+ errstr = errname(err);
+ else
+ errstr = "(No error)";
+ return errstr ?: "(Invalid error)";
+}
+
+bool __bch2_err_matches(int err, int class)
+{
+ err = abs(err);
+ class = abs(class);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+ BUG_ON(class >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && err != class)
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return err == class;
+}
+
+int __bch2_err_class(int err)
+{
+ err = -err;
+ BUG_ON((unsigned) err >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return -err;
+}
+
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+ if (status == BLK_STS_REMOVED)
+ return "device removed";
+ return blk_status_to_str(status);
+}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
new file mode 100644
index 000000000000..9ce29681eec9
--- /dev/null
+++ b/fs/bcachefs/errcode.h
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERRCODE_H
+#define _BCACHEFS_ERRCODE_H
+
+#define BCH_ERRCODES() \
+ x(ERANGE, ERANGE_option_too_small) \
+ x(ERANGE, ERANGE_option_too_big) \
+ x(ENOMEM, ENOMEM_stripe_buf) \
+ x(ENOMEM, ENOMEM_replicas_table) \
+ x(ENOMEM, ENOMEM_cpu_replicas) \
+ x(ENOMEM, ENOMEM_replicas_gc) \
+ x(ENOMEM, ENOMEM_disk_groups_validate) \
+ x(ENOMEM, ENOMEM_disk_groups_to_cpu) \
+ x(ENOMEM, ENOMEM_mark_snapshot) \
+ x(ENOMEM, ENOMEM_mark_stripe) \
+ x(ENOMEM, ENOMEM_mark_stripe_ptr) \
+ x(ENOMEM, ENOMEM_btree_key_cache_create) \
+ x(ENOMEM, ENOMEM_btree_key_cache_fill) \
+ x(ENOMEM, ENOMEM_btree_key_cache_insert) \
+ x(ENOMEM, ENOMEM_trans_kmalloc) \
+ x(ENOMEM, ENOMEM_trans_log_msg) \
+ x(ENOMEM, ENOMEM_do_encrypt) \
+ x(ENOMEM, ENOMEM_ec_read_extent) \
+ x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \
+ x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \
+ x(ENOMEM, ENOMEM_fs_btree_cache_init) \
+ x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \
+ x(ENOMEM, ENOMEM_fs_counters_init) \
+ x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \
+ x(ENOMEM, ENOMEM_io_clock_init) \
+ x(ENOMEM, ENOMEM_blacklist_table_init) \
+ x(ENOMEM, ENOMEM_sb_realloc_injected) \
+ x(ENOMEM, ENOMEM_sb_bio_realloc) \
+ x(ENOMEM, ENOMEM_sb_buf_realloc) \
+ x(ENOMEM, ENOMEM_sb_journal_validate) \
+ x(ENOMEM, ENOMEM_sb_journal_v2_validate) \
+ x(ENOMEM, ENOMEM_journal_entry_add) \
+ x(ENOMEM, ENOMEM_journal_read_buf_realloc) \
+ x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\
+ x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \
+ x(ENOMEM, ENOMEM_bio_read_init) \
+ x(ENOMEM, ENOMEM_bio_read_split_init) \
+ x(ENOMEM, ENOMEM_bio_write_init) \
+ x(ENOMEM, ENOMEM_bio_bounce_pages_init) \
+ x(ENOMEM, ENOMEM_writepage_bioset_init) \
+ x(ENOMEM, ENOMEM_dio_read_bioset_init) \
+ x(ENOMEM, ENOMEM_dio_write_bioset_init) \
+ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
+ x(ENOMEM, ENOMEM_promote_table_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_read_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_write_init) \
+ x(ENOMEM, ENOMEM_compression_workspace_init) \
+ x(ENOMEM, ENOMEM_decompression_workspace_init) \
+ x(ENOMEM, ENOMEM_bucket_gens) \
+ x(ENOMEM, ENOMEM_buckets_nouse) \
+ x(ENOMEM, ENOMEM_usage_init) \
+ x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \
+ x(ENOMEM, ENOMEM_btree_node_reclaim) \
+ x(ENOMEM, ENOMEM_btree_node_mem_alloc) \
+ x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \
+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\
+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \
+ x(ENOMEM, ENOMEM_set_nr_journal_buckets) \
+ x(ENOMEM, ENOMEM_dev_journal_init) \
+ x(ENOMEM, ENOMEM_journal_pin_fifo) \
+ x(ENOMEM, ENOMEM_journal_buf) \
+ x(ENOMEM, ENOMEM_gc_start) \
+ x(ENOMEM, ENOMEM_gc_alloc_start) \
+ x(ENOMEM, ENOMEM_gc_reflink_start) \
+ x(ENOMEM, ENOMEM_gc_gens) \
+ x(ENOMEM, ENOMEM_gc_repair_key) \
+ x(ENOMEM, ENOMEM_fsck_extent_ends_at) \
+ x(ENOMEM, ENOMEM_fsck_add_nlink) \
+ x(ENOMEM, ENOMEM_journal_key_insert) \
+ x(ENOMEM, ENOMEM_journal_keys_sort) \
+ x(ENOMEM, ENOMEM_journal_replay) \
+ x(ENOMEM, ENOMEM_read_superblock_clean) \
+ x(ENOMEM, ENOMEM_fs_alloc) \
+ x(ENOMEM, ENOMEM_fs_name_alloc) \
+ x(ENOMEM, ENOMEM_fs_other_alloc) \
+ x(ENOMEM, ENOMEM_dev_alloc) \
+ x(ENOSPC, ENOSPC_disk_reservation) \
+ x(ENOSPC, ENOSPC_bucket_alloc) \
+ x(ENOSPC, ENOSPC_disk_label_add) \
+ x(ENOSPC, ENOSPC_stripe_create) \
+ x(ENOSPC, ENOSPC_inode_create) \
+ x(ENOSPC, ENOSPC_str_hash_create) \
+ x(ENOSPC, ENOSPC_snapshot_create) \
+ x(ENOSPC, ENOSPC_subvolume_create) \
+ x(ENOSPC, ENOSPC_sb) \
+ x(ENOSPC, ENOSPC_sb_journal) \
+ x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \
+ x(ENOSPC, ENOSPC_sb_quota) \
+ x(ENOSPC, ENOSPC_sb_replicas) \
+ x(ENOSPC, ENOSPC_sb_members) \
+ x(ENOSPC, ENOSPC_sb_members_v2) \
+ x(ENOSPC, ENOSPC_sb_crypt) \
+ x(ENOSPC, ENOSPC_sb_downgrade) \
+ x(ENOSPC, ENOSPC_btree_slot) \
+ x(ENOSPC, ENOSPC_snapshot_tree) \
+ x(ENOENT, ENOENT_bkey_type_mismatch) \
+ x(ENOENT, ENOENT_str_hash_lookup) \
+ x(ENOENT, ENOENT_str_hash_set_must_replace) \
+ x(ENOENT, ENOENT_inode) \
+ x(ENOENT, ENOENT_not_subvol) \
+ x(ENOENT, ENOENT_not_directory) \
+ x(ENOENT, ENOENT_directory_dead) \
+ x(ENOENT, ENOENT_subvolume) \
+ x(ENOENT, ENOENT_snapshot_tree) \
+ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
+ x(ENOENT, ENOENT_dev_not_found) \
+ x(ENOENT, ENOENT_dev_idx_not_found) \
+ x(0, open_buckets_empty) \
+ x(0, freelist_empty) \
+ x(BCH_ERR_freelist_empty, no_buckets_found) \
+ x(0, transaction_restart) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
+ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
+ x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
+ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
+ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
+ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
+ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
+ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
+ x(BCH_ERR_transaction_restart, transaction_restart_nested) \
+ x(0, no_btree_node) \
+ x(BCH_ERR_no_btree_node, no_btree_node_relock) \
+ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
+ x(BCH_ERR_no_btree_node, no_btree_node_drop) \
+ x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
+ x(BCH_ERR_no_btree_node, no_btree_node_up) \
+ x(BCH_ERR_no_btree_node, no_btree_node_down) \
+ x(BCH_ERR_no_btree_node, no_btree_node_init) \
+ x(BCH_ERR_no_btree_node, no_btree_node_cached) \
+ x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \
+ x(0, btree_insert_fail) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
+ x(0, backpointer_to_overwritten_btree_node) \
+ x(0, lock_fail_root_changed) \
+ x(0, journal_reclaim_would_deadlock) \
+ x(EINVAL, fsck) \
+ x(BCH_ERR_fsck, fsck_fix) \
+ x(BCH_ERR_fsck, fsck_ignore) \
+ x(BCH_ERR_fsck, fsck_errors_not_fixed) \
+ x(BCH_ERR_fsck, fsck_repair_unimplemented) \
+ x(BCH_ERR_fsck, fsck_repair_impossible) \
+ x(0, restart_recovery) \
+ x(0, data_update_done) \
+ x(EINVAL, device_state_not_allowed) \
+ x(EINVAL, member_info_missing) \
+ x(EINVAL, mismatched_block_size) \
+ x(EINVAL, block_size_too_small) \
+ x(EINVAL, bucket_size_too_small) \
+ x(EINVAL, device_size_too_small) \
+ x(EINVAL, device_not_a_member_of_filesystem) \
+ x(EINVAL, device_has_been_removed) \
+ x(EINVAL, device_already_online) \
+ x(EINVAL, insufficient_devices_to_start) \
+ x(EINVAL, invalid) \
+ x(EINVAL, internal_fsck_err) \
+ x(EROFS, erofs_trans_commit) \
+ x(EROFS, erofs_no_writes) \
+ x(EROFS, erofs_journal_err) \
+ x(EROFS, erofs_sb_err) \
+ x(EROFS, erofs_unfixed_errors) \
+ x(EROFS, erofs_norecovery) \
+ x(EROFS, erofs_nochanges) \
+ x(EROFS, insufficient_devices) \
+ x(0, operation_blocked) \
+ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
+ x(BCH_ERR_operation_blocked, journal_res_get_blocked) \
+ x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \
+ x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \
+ x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \
+ x(BCH_ERR_invalid, invalid_sb) \
+ x(BCH_ERR_invalid_sb, invalid_sb_magic) \
+ x(BCH_ERR_invalid_sb, invalid_sb_version) \
+ x(BCH_ERR_invalid_sb, invalid_sb_features) \
+ x(BCH_ERR_invalid_sb, invalid_sb_too_big) \
+ x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \
+ x(BCH_ERR_invalid_sb, invalid_sb_csum) \
+ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
+ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
+ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
+ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
+ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
+ x(BCH_ERR_invalid_sb, invalid_sb_field_size) \
+ x(BCH_ERR_invalid_sb, invalid_sb_layout) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
+ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
+ x(BCH_ERR_invalid_sb, invalid_sb_members) \
+ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
+ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \
+ x(BCH_ERR_invalid_sb, invalid_replicas_entry) \
+ x(BCH_ERR_invalid_sb, invalid_sb_journal) \
+ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \
+ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
+ x(BCH_ERR_invalid_sb, invalid_sb_clean) \
+ x(BCH_ERR_invalid_sb, invalid_sb_quota) \
+ x(BCH_ERR_invalid_sb, invalid_sb_errors) \
+ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
+ x(BCH_ERR_invalid_sb, invalid_sb_ext) \
+ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
+ x(BCH_ERR_invalid, invalid_bkey) \
+ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+ x(EIO, btree_node_read_err) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \
+ x(0, nopromote) \
+ x(BCH_ERR_nopromote, nopromote_may_not) \
+ x(BCH_ERR_nopromote, nopromote_already_promoted) \
+ x(BCH_ERR_nopromote, nopromote_unwritten) \
+ x(BCH_ERR_nopromote, nopromote_congested) \
+ x(BCH_ERR_nopromote, nopromote_in_flight) \
+ x(BCH_ERR_nopromote, nopromote_enomem)
+
+enum bch_errcode {
+ BCH_ERR_START = 2048,
+#define x(class, err) BCH_ERR_##err,
+ BCH_ERRCODES()
+#undef x
+ BCH_ERR_MAX
+};
+
+const char *bch2_err_str(int);
+bool __bch2_err_matches(int, int);
+
+static inline bool _bch2_err_matches(int err, int class)
+{
+ return err < 0 && __bch2_err_matches(err, class);
+}
+
+#define bch2_err_matches(_err, _class) \
+({ \
+ BUILD_BUG_ON(!__builtin_constant_p(_class)); \
+ unlikely(_bch2_err_matches(_err, _class)); \
+})
+
+int __bch2_err_class(int);
+
+static inline long bch2_err_class(long err)
+{
+ return err < 0 ? __bch2_err_class(err) : err;
+}
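+
+/*
+ * Illustrative usage (not a real call site): private error codes form a tree
+ * rooted at standard errnos, so callers can match on any ancestor and convert
+ * back to a plain errno at the boundary to generic code:
+ *
+ *	int ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+ *
+ *	bch2_err_matches(ret, ENOMEM);				// true
+ *	bch2_err_matches(ret, BCH_ERR_transaction_restart);	// false
+ *	bch2_err_class(ret);					// -ENOMEM
+ */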
+
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
+
+const char *bch2_blk_status_to_str(blk_status_t);
+
+#endif /* _BCACHEFS_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
new file mode 100644
index 000000000000..25cf78a7b946
--- /dev/null
+++ b/fs/bcachefs/error.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "error.h"
+#include "super.h"
+
+#define FSCK_ERR_RATELIMIT_NR 10
+
+bool bch2_inconsistent_error(struct bch_fs *c)
+{
+ set_bit(BCH_FS_ERROR, &c->flags);
+
+ switch (c->opts.errors) {
+ case BCH_ON_ERROR_continue:
+ return false;
+ case BCH_ON_ERROR_ro:
+ if (bch2_fs_emergency_read_only(c))
+ bch_err(c, "inconsistency detected - emergency read only");
+ return true;
+ case BCH_ON_ERROR_panic:
+ panic(bch2_fmt(c, "panic after error"));
+ return true;
+ default:
+ BUG();
+ }
+}
+
+void bch2_topology_error(struct bch_fs *c)
+{
+ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ bch2_inconsistent_error(c);
+}
+
+void bch2_fatal_error(struct bch_fs *c)
+{
+ if (bch2_fs_emergency_read_only(c))
+ bch_err(c, "fatal error - emergency read only");
+}
+
+void bch2_io_error_work(struct work_struct *work)
+{
+ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
+ struct bch_fs *c = ca->fs;
+ bool dev;
+
+ down_write(&c->state_lock);
+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED);
+ if (dev
+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED)
+ : bch2_fs_emergency_read_only(c))
+ bch_err(ca,
+ "too many IO errors, setting %s RO",
+ dev ? "device" : "filesystem");
+ up_write(&c->state_lock);
+}
+
+void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
+{
+ atomic64_inc(&ca->errors[type]);
+ //queue_work(system_long_wq, &ca->io_error_work);
+}
+
+enum ask_yn {
+ YN_NO,
+ YN_YES,
+ YN_ALLNO,
+ YN_ALLYES,
+};
+
+#ifdef __KERNEL__
+#define bch2_fsck_ask_yn() YN_NO
+#else
+
+#include "tools-util.h"
+
+enum ask_yn bch2_fsck_ask_yn(void)
+{
+ char *buf = NULL;
+ size_t buflen = 0;
+	enum ask_yn ret;
+
+	while (true) {
+		fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
+		fflush(stdout);
+
+		if (getline(&buf, &buflen, stdin) < 0)
+			die("error reading from standard input");
+
+		strim(buf);
+		if (strlen(buf) != 1)
+			continue;
+
+		switch (buf[0]) {
+		case 'n':
+			ret = YN_NO;
+			goto found;
+		case 'y':
+			ret = YN_YES;
+			goto found;
+		case 'N':
+			ret = YN_ALLNO;
+			goto found;
+		case 'Y':
+			ret = YN_ALLYES;
+			goto found;
+		}
+	}
+found:
+	/* free the buffer getline() allocated before returning the answer: */
+	free(buf);
+	return ret;
+}
+
+#endif
+
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
+{
+ struct fsck_err_state *s;
+
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ return NULL;
+
+ list_for_each_entry(s, &c->fsck_error_msgs, list)
+ if (s->fmt == fmt) {
+ /*
+ * move it to the head of the list: repeated fsck errors
+ * are common
+ */
+ list_move(&s->list, &c->fsck_error_msgs);
+ return s;
+ }
+
+ s = kzalloc(sizeof(*s), GFP_NOFS);
+ if (!s) {
+ if (!c->fsck_alloc_msgs_err)
+ bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
+ c->fsck_alloc_msgs_err = true;
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&s->list);
+ s->fmt = fmt;
+ list_add(&s->list, &c->fsck_error_msgs);
+ return s;
+}
+
+int bch2_fsck_err(struct bch_fs *c,
+ enum bch_fsck_flags flags,
+ enum bch_sb_error_id err,
+ const char *fmt, ...)
+{
+ struct fsck_err_state *s = NULL;
+ va_list args;
+ bool print = true, suppressing = false, inconsistent = false;
+ struct printbuf buf = PRINTBUF, *out = &buf;
+ int ret = -BCH_ERR_fsck_ignore;
+
+ if (test_bit(err, c->sb.errors_silent))
+ return -BCH_ERR_fsck_fix;
+
+ bch2_sb_error_count(c, err);
+
+ va_start(args, fmt);
+ prt_vprintf(out, fmt, args);
+ va_end(args);
+
+ mutex_lock(&c->fsck_error_msgs_lock);
+ s = fsck_err_get(c, fmt);
+ if (s) {
+ /*
+ * We may be called multiple times for the same error on
+ * transaction restart - this memoizes instead of asking the user
+ * multiple times for the same error:
+ */
+ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
+ ret = s->ret;
+ mutex_unlock(&c->fsck_error_msgs_lock);
+ printbuf_exit(&buf);
+ return ret;
+ }
+
+ kfree(s->last_msg);
+ s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+
+ if (c->opts.ratelimit_errors &&
+ !(flags & FSCK_NO_RATELIMIT) &&
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
+ suppressing = true;
+ else
+ print = false;
+ }
+
+ s->nr++;
+ }
+
+#ifdef BCACHEFS_LOG_PREFIX
+ if (!strncmp(fmt, "bcachefs:", 9))
+ prt_printf(out, bch2_log_msg(c, ""));
+#endif
+
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+ if (c->opts.errors != BCH_ON_ERROR_continue ||
+ !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
+ prt_str(out, ", shutting down");
+ inconsistent = true;
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ } else if (flags & FSCK_CAN_FIX) {
+ prt_str(out, ", fixing");
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", continuing");
+ ret = -BCH_ERR_fsck_ignore;
+ }
+ } else if (c->opts.fix_errors == FSCK_FIX_exit) {
+ prt_str(out, ", exiting");
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ } else if (flags & FSCK_CAN_FIX) {
+ int fix = s && s->fix
+ ? s->fix
+ : c->opts.fix_errors;
+
+ if (fix == FSCK_FIX_ask) {
+ int ask;
+
+ prt_str(out, ": fix?");
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+ print = false;
+
+ ask = bch2_fsck_ask_yn();
+
+ if (ask >= YN_ALLNO && s)
+ s->fix = ask == YN_ALLNO
+ ? FSCK_FIX_no
+ : FSCK_FIX_yes;
+
+ ret = ask & 1
+ ? -BCH_ERR_fsck_fix
+ : -BCH_ERR_fsck_ignore;
+ } else if (fix == FSCK_FIX_yes ||
+ (c->opts.nochanges &&
+ !(flags & FSCK_CAN_IGNORE))) {
+ prt_str(out, ", fixing");
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", not fixing");
+ }
+ } else if (flags & FSCK_NEED_FSCK) {
+ prt_str(out, " (run fsck to correct)");
+ } else {
+ prt_str(out, " (repair unimplemented)");
+ }
+
+ if (ret == -BCH_ERR_fsck_ignore &&
+ (c->opts.fix_errors == FSCK_FIX_exit ||
+ !(flags & FSCK_CAN_IGNORE)))
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+
+ if (print)
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+
+ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+ (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore))
+ bch_err(c, "Unable to continue, halting");
+ else if (suppressing)
+ bch_err(c, "Ratelimiting new instances of previous error");
+
+ if (s)
+ s->ret = ret;
+
+ mutex_unlock(&c->fsck_error_msgs_lock);
+
+ printbuf_exit(&buf);
+
+ if (inconsistent)
+ bch2_inconsistent_error(c);
+
+ if (ret == -BCH_ERR_fsck_fix) {
+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ } else {
+ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+ set_bit(BCH_FS_ERROR, &c->flags);
+ }
+
+ return ret;
+}
+
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+ struct fsck_err_state *s, *n;
+
+ mutex_lock(&c->fsck_error_msgs_lock);
+
+ list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
+ if (s->ratelimited && s->last_msg)
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
+
+ list_del(&s->list);
+ kfree(s->last_msg);
+ kfree(s);
+ }
+
+ mutex_unlock(&c->fsck_error_msgs_lock);
+}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
new file mode 100644
index 000000000000..fec17d1353d1
--- /dev/null
+++ b/fs/bcachefs/error.h
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERROR_H
+#define _BCACHEFS_ERROR_H
+
+#include <linux/list.h>
+#include <linux/printk.h>
+#include "sb-errors.h"
+
+struct bch_dev;
+struct bch_fs;
+struct work_struct;
+
+/*
+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag
+ * superblock as such
+ */
+
+/* Error messages: */
+
+/*
+ * Inconsistency errors: The on disk data is inconsistent. If these occur during
+ * initial recovery, they don't indicate a bug in the running code - we walk all
+ * the metadata before modifying anything. If they occur at runtime, they
+ * indicate either a bug in the running code or (less likely) data is being
+ * silently corrupted under us.
+ *
+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in
+ * BCH_ON_ERROR_CONTINUE mode
+ */
+
+bool bch2_inconsistent_error(struct bch_fs *);
+
+void bch2_topology_error(struct bch_fs *);
+
+#define bch2_fs_inconsistent(c, ...) \
+({ \
+ bch_err(c, __VA_ARGS__); \
+ bch2_inconsistent_error(c); \
+})
+
+#define bch2_fs_inconsistent_on(cond, c, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_fs_inconsistent(c, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Later we might want to mark only the particular device inconsistent, not the
+ * entire filesystem:
+ */
+
+#define bch2_dev_inconsistent(ca, ...) \
+do { \
+ bch_err(ca, __VA_ARGS__); \
+ bch2_inconsistent_error((ca)->fs); \
+} while (0)
+
+#define bch2_dev_inconsistent_on(cond, ca, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_dev_inconsistent(ca, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * When a transaction update discovers or is causing a fs inconsistency, it's
+ * helpful to also dump the pending updates:
+ */
+#define bch2_trans_inconsistent(trans, ...) \
+({ \
+ bch_err(trans->c, __VA_ARGS__); \
+ bch2_dump_trans_updates(trans); \
+ bch2_inconsistent_error(trans->c); \
+})
+
+#define bch2_trans_inconsistent_on(cond, trans, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_trans_inconsistent(trans, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Fsck errors: inconsistency errors we detect at mount time, and that we
+ * should ideally be able to repair:
+ */
+
+struct fsck_err_state {
+ struct list_head list;
+ const char *fmt;
+ u64 nr;
+ bool ratelimited;
+ int ret;
+ int fix;
+ char *last_msg;
+};
+
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NEED_FSCK = 1 << 2,
+ FSCK_NO_RATELIMIT = 1 << 3,
+};
+
+#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
+
+__printf(4, 5) __cold
+int bch2_fsck_err(struct bch_fs *,
+ enum bch_fsck_flags,
+ enum bch_sb_error_id,
+ const char *, ...);
+void bch2_flush_fsck_errs(struct bch_fs *);
+
+#define __fsck_err(c, _flags, _err_type, ...) \
+({ \
+ int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \
+ __VA_ARGS__); \
+ \
+ if (_ret != -BCH_ERR_fsck_fix && \
+ _ret != -BCH_ERR_fsck_ignore) { \
+ ret = _ret; \
+ goto fsck_err; \
+ } \
+ \
+ _ret == -BCH_ERR_fsck_fix; \
+})
+
+/* These macros return true if error should be fixed: */
+
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
+
+#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
+ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
+
+#define need_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define need_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
+
+#define fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
+
+#define fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
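+
+/*
+ * Usage sketch (`some_err_type` is a placeholder, not a real BCH_FSCK_ERR_*
+ * id): these macros assume an `int ret` and an `fsck_err:` label in the
+ * enclosing function, and evaluate to true when the error should be fixed:
+ *
+ *	if (fsck_err_on(bad_condition, c, some_err_type,
+ *			"describe what's wrong: %u", val))
+ *		repair_it();
+ *	...
+ * fsck_err:
+ *	return ret;
+ */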
+
+__printf(4, 0)
+static inline void bch2_bkey_fsck_err(struct bch_fs *c,
+ struct printbuf *err_msg,
+ enum bch_sb_error_id err_type,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ prt_vprintf(err_msg, fmt, args);
+ va_end(args);
+}
+
+#define bkey_fsck_err(c, _err_msg, _err_type, ...) \
+do { \
+ prt_printf(_err_msg, __VA_ARGS__); \
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \
+ ret = -BCH_ERR_invalid_bkey; \
+ goto fsck_err; \
+} while (0)
+
+#define bkey_fsck_err_on(cond, ...) \
+do { \
+ if (unlikely(cond)) \
+ bkey_fsck_err(__VA_ARGS__); \
+} while (0)
+
+/*
+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW
+ * mode - pretty much just due to metadata IO errors:
+ */
+
+void bch2_fatal_error(struct bch_fs *);
+
+#define bch2_fs_fatal_error(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ bch2_fatal_error(c); \
+} while (0)
+
+#define bch2_fs_fatal_err_on(cond, c, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_fs_fatal_error(c, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * IO errors: either recoverable metadata IO (because we have replicas), or data
+ * IO - we need to log it and print out a message, but we don't (necessarily)
+ * want to shut down the fs:
+ */
+
+void bch2_io_error_work(struct work_struct *);
+
+/* Does the error handling without logging a message */
+void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
+
+#define bch2_dev_io_err_on(cond, ca, _type, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) { \
+ bch_err_dev_ratelimited(ca, __VA_ARGS__); \
+ bch2_io_error(ca, _type); \
+ } \
+ _ret; \
+})
+
+#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) { \
+ bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
+ bch2_io_error(ca, _type); \
+ } \
+ _ret; \
+})
+
+#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
new file mode 100644
index 000000000000..21af6fb8cecf
--- /dev/null
+++ b/fs/bcachefs/extent_update.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "debug.h"
+#include "extents.h"
+#include "extent_update.h"
+
+/*
+ * This counts the number of iterators to the alloc & ec btrees we'll need for
+ * inserting/removing this extent:
+ */
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ unsigned ret = 0, lru = 0;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ /* Might also be updating LRU btree */
+ if (entry->ptr.cached)
+ lru++;
+
+ fallthrough;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ ret++;
+ }
+ }
+
+ /*
+ * Updating keys in the alloc btree may also update keys in the
+ * freespace or discard btrees:
+ */
+ return lru + ret * 2;
+}
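+/*
+ * Example (hypothetical extent): two cached pointers plus one stripe pointer
+ * gives lru = 2 and ret = 3, so we budget 2 + 3 * 2 = 8 iterators.
+ */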
+
+static int count_iters_for_insert(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned offset,
+ struct bpos *end,
+ unsigned *nr_iters,
+ unsigned max_iters)
+{
+ int ret = 0, ret2 = 0;
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
+
+ break;
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx = le64_to_cpu(p.v->idx);
+ unsigned sectors = bpos_min(*end, p.k->p).offset -
+ bkey_start_offset(p.k);
+ struct btree_iter iter;
+ struct bkey_s_c r_k;
+
+ for_each_btree_key_norestart(trans, iter,
+ BTREE_ID_reflink, POS(0, idx + offset),
+ BTREE_ITER_SLOTS, r_k, ret2) {
+ if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
+ break;
+
+ /* extent_update_to_keys(), for the reflink_v update */
+ *nr_iters += 1;
+
+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
+
+ if (*nr_iters >= max_iters) {
+ struct bpos pos = bkey_start_pos(k.k);
+ pos.offset += min_t(u64, k.k->size,
+ r_k.k->p.offset - idx);
+
+ *end = bpos_min(*end, pos);
+ ret = 1;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ break;
+ }
+ }
+
+ return ret2 ?: ret;
+}
+
+#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
+
+int bch2_extent_atomic_end(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bpos *end)
+{
+ struct btree_iter copy;
+ struct bkey_s_c k;
+ unsigned nr_iters = 0;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ *end = insert->k.p;
+
+ /* extent_update_to_keys(): */
+ nr_iters += 1;
+
+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
+ &nr_iters, EXTENT_ITERS_MAX / 2);
+ if (ret < 0)
+ return ret;
+
+ bch2_trans_copy_iter(&copy, iter);
+
+ for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) {
+ unsigned offset = 0;
+
+ if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
+
+ /* extent_handle_overwrites(): */
+ switch (bch2_extent_overlap(&insert->k, k.k)) {
+ case BCH_EXTENT_OVERLAP_ALL:
+ case BCH_EXTENT_OVERLAP_FRONT:
+ nr_iters += 1;
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ case BCH_EXTENT_OVERLAP_MIDDLE:
+ nr_iters += 2;
+ break;
+ }
+
+ ret = count_iters_for_insert(trans, k, offset, end,
+ &nr_iters, EXTENT_ITERS_MAX);
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &copy);
+ return ret < 0 ? ret : 0;
+}
+
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *k)
+{
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(trans, iter, k, &end);
+ if (ret)
+ return ret;
+
+ bch2_cut_back(end, k);
+ return 0;
+}
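+/*
+ * Intended call pattern in an update path (sketch only; the surrounding
+ * variables are illustrative):
+ *
+ *	ret = bch2_extent_trim_atomic(trans, &iter, new_extent);
+ *	if (ret)
+ *		goto err;
+ *
+ * Afterwards new_extent may be shorter than originally requested; callers
+ * loop, committing one atomically-sized chunk at a time.
+ */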
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
new file mode 100644
index 000000000000..6f5cf449361a
--- /dev/null
+++ b/fs/bcachefs/extent_update.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENT_UPDATE_H
+#define _BCACHEFS_EXTENT_UPDATE_H
+
+#include "bcachefs.h"
+
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *);
+
+#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
new file mode 100644
index 000000000000..9d8afcb5979a
--- /dev/null
+++ b/fs/bcachefs/extents.c
@@ -0,0 +1,1511 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Code for managing the extent btree and dynamically updating the writeback
+ * dirty sector count.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "compress.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+#include "util.h"
+
+static unsigned bch2_crc_field_size_max[] = {
+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
+};
+
+static void bch2_extent_crc_pack(union bch_extent_crc *,
+ struct bch_extent_crc_unpacked,
+ enum bch_extent_entry_type);
+
+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
+ unsigned dev)
+{
+ struct bch_dev_io_failures *i;
+
+ for (i = f->devs; i < f->devs + f->nr; i++)
+ if (i->dev == dev)
+ return i;
+
+ return NULL;
+}
+
+void bch2_mark_io_failure(struct bch_io_failures *failed,
+ struct extent_ptr_decoded *p)
+{
+ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
+
+ if (!f) {
+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
+
+ f = &failed->devs[failed->nr++];
+ f->dev = p->ptr.dev;
+ f->idx = p->idx;
+ f->nr_failed = 1;
+ f->nr_retries = 0;
+ } else if (p->idx != f->idx) {
+ f->idx = p->idx;
+ f->nr_failed = 1;
+ f->nr_retries = 0;
+ } else {
+ f->nr_failed++;
+ }
+}
+
+/*
+ * returns true if p1 is better than p2:
+ */
+static inline bool ptr_better(struct bch_fs *c,
+ const struct extent_ptr_decoded p1,
+ const struct extent_ptr_decoded p2)
+{
+ if (likely(!p1.idx && !p2.idx)) {
+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
+ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
+
+ u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
+ u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
+
+ /* Pick at random, biased in favor of the faster device: */
+
+ return bch2_rand_range(l1 + l2) > l1;
+ }
+
+ if (bch2_force_reconstruct_read)
+ return p1.idx > p2.idx;
+
+ return p1.idx < p2.idx;
+}
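+/*
+ * Worked example: with cur_latency l1 = 1ms and l2 = 3ms,
+ * bch2_rand_range(l1 + l2) exceeds l1 about three times out of four, so the
+ * faster device (p1 here) serves ~75% of reads.
+ */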
+
+/*
+ * This picks a non-stale pointer to read from, taking recorded IO failures
+ * (@failed, which may be NULL) into account: pointers whose device has
+ * exhausted its retries are skipped, or read via erasure coding
+ * reconstruction when that's possible.
+ */
+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_failures *failed,
+ struct extent_ptr_decoded *pick)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_dev_io_failures *f;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ if (k.k->type == KEY_TYPE_error)
+ return -EIO;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ /*
+ * Unwritten extent: no need to actually read, treat it as a
+ * hole and return 0s:
+ */
+ if (p.ptr.unwritten)
+ return 0;
+
+ ca = bch_dev_bkey_exists(c, p.ptr.dev);
+
+ /*
+ * If there are any dirty pointers it's an error if we can't
+ * read:
+ */
+ if (!ret && !p.ptr.cached)
+ ret = -EIO;
+
+ if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+ continue;
+
+ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
+ if (f)
+ p.idx = f->nr_failed < f->nr_retries
+ ? f->idx
+ : f->idx + 1;
+
+ if (!p.idx &&
+ !bch2_dev_is_readable(ca))
+ p.idx++;
+
+ if (bch2_force_reconstruct_read &&
+ !p.idx && p.has_ec)
+ p.idx++;
+
+ if (p.idx >= (unsigned) p.has_ec + 1)
+ continue;
+
+ if (ret > 0 && !ptr_better(c, p, *pick))
+ continue;
+
+ *pick = p;
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/* KEY_TYPE_btree_ptr: */
+
+int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
+ btree_ptr_val_too_big,
+ "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
+
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
+}
+
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
+ btree_ptr_v2_val_too_big,
+ "value too big (%zu > %zu)",
+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
+}
+
+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+
+ prt_printf(out, "seq %llx written %u min_key %s",
+ le64_to_cpu(bp.v->seq),
+ le16_to_cpu(bp.v->sectors_written),
+ BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
+
+ bch2_bpos_to_text(out, bp.v->min_key);
+ prt_printf(out, " ");
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
+ unsigned big_endian, int write,
+ struct bkey_s k)
+{
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
+
+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bkey_eq(bp.v->min_key, POS_MIN))
+ bp.v->min_key = write
+ ? bpos_nosnap_predecessor(bp.v->min_key)
+ : bpos_nosnap_successor(bp.v->min_key);
+}
+
+/* KEY_TYPE_extent: */
+
+bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
+ struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
+ union bch_extent_entry *en_l;
+ const union bch_extent_entry *en_r;
+ struct extent_ptr_decoded lp, rp;
+ bool use_right_ptr;
+ struct bch_dev *ca;
+
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_type(en_l) != extent_entry_type(en_r))
+ return false;
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ if (en_l < l_ptrs.end || en_r < r_ptrs.end)
+ return false;
+
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ lp.crc = bch2_extent_crc_unpack(l.k, NULL);
+ rp.crc = bch2_extent_crc_unpack(r.k, NULL);
+
+ while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
+ __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
+ if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
+ rp.ptr.offset + rp.crc.offset ||
+ lp.ptr.dev != rp.ptr.dev ||
+ lp.ptr.gen != rp.ptr.gen ||
+ lp.ptr.unwritten != rp.ptr.unwritten ||
+ lp.has_ec != rp.has_ec)
+ return false;
+
+ /* Extents may not straddle buckets: */
+ ca = bch_dev_bkey_exists(c, lp.ptr.dev);
+ if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr))
+ return false;
+
+ if (lp.has_ec != rp.has_ec ||
+ (lp.has_ec &&
+ (lp.ec.block != rp.ec.block ||
+ lp.ec.redundancy != rp.ec.redundancy ||
+ lp.ec.idx != rp.ec.idx)))
+ return false;
+
+ if (lp.crc.compression_type != rp.crc.compression_type ||
+ lp.crc.nonce != rp.crc.nonce)
+ return false;
+
+ if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
+ lp.crc.uncompressed_size) {
+ /* can use left extent's crc entry */
+ } else if (lp.crc.live_size <= rp.crc.offset) {
+ /* can use right extent's crc entry */
+ } else {
+ /* check if checksums can be merged: */
+ if (lp.crc.csum_type != rp.crc.csum_type ||
+ lp.crc.nonce != rp.crc.nonce ||
+ crc_is_compressed(lp.crc) ||
+ !bch2_checksum_mergeable(lp.crc.csum_type))
+ return false;
+
+ if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
+ rp.crc.offset)
+ return false;
+
+ if (lp.crc.csum_type &&
+ lp.crc.uncompressed_size +
+ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
+ return false;
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size >
+ bch2_crc_field_size_max[extent_entry_type(en_l)])
+ return false;
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ use_right_ptr = false;
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end) {
+ if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
+ use_right_ptr)
+ en_l->ptr = en_r->ptr;
+
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l =
+ bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r =
+ bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ use_right_ptr = false;
+
+ if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
+ crc_l.uncompressed_size) {
+ /* can use left extent's crc entry */
+ } else if (crc_l.live_size <= crc_r.offset) {
+ /* can use right extent's crc entry */
+ crc_r.offset -= crc_l.live_size;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
+ extent_entry_type(en_l));
+ use_right_ptr = true;
+ } else {
+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
+ crc_l.csum,
+ crc_r.csum,
+ crc_r.uncompressed_size << 9);
+
+ crc_l.uncompressed_size += crc_r.uncompressed_size;
+ crc_l.compressed_size += crc_r.compressed_size;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
+ extent_entry_type(en_l));
+ }
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
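+/*
+ * In other words: merging only succeeds when the two extents are contiguous
+ * on every device (same dev/gen, adjacent offsets, not straddling a bucket
+ * boundary) and their checksum entries can be reused or merged; e.g. two
+ * such 8-sector extents become a single 16-sector key via bch2_key_resize().
+ */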
+
+/* KEY_TYPE_reservation: */
+
+int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
+ reservation_key_nr_replicas_invalid,
+ "invalid nr_replicas (%u)", r.v->nr_replicas);
+fsck_err:
+ return ret;
+}
+
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+ prt_printf(out, "generation %u replicas %u",
+ le32_to_cpu(r.v->generation),
+ r.v->nr_replicas);
+}
+
+bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reservation l = bkey_s_to_reservation(_l);
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
+
+ if (l.v->generation != r.v->generation ||
+ l.v->nr_replicas != r.v->nr_replicas)
+ return false;
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+/* Extent checksum entries: */
+
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+ struct bch_extent_crc_unpacked r)
+{
+ return (l.csum_type != r.csum_type ||
+ l.compression_type != r.compression_type ||
+ l.compressed_size != r.compressed_size ||
+ l.uncompressed_size != r.uncompressed_size ||
+ l.offset != r.offset ||
+ l.live_size != r.live_size ||
+ l.nonce != r.nonce ||
+ bch2_crc_cmp(l.csum, r.csum));
+}
+
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
+ struct bch_extent_crc_unpacked n)
+{
+ return !crc_is_compressed(u) &&
+ u.csum_type &&
+ u.uncompressed_size > u.live_size &&
+ bch2_csum_type_is_encryption(u.csum_type) ==
+ bch2_csum_type_is_encryption(n.csum_type);
+}
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
+ struct bch_extent_crc_unpacked n)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+
+ if (!n.csum_type)
+ return false;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (can_narrow_crc(crc, n))
+ return true;
+
+ return false;
+}
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ */
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ struct bch_extent_crc_unpacked u;
+ struct extent_ptr_decoded p;
+ union bch_extent_entry *i;
+ bool ret = false;
+
+ /* Find a checksum entry that covers only live data: */
+ if (!n.csum_type) {
+ bkey_for_each_crc(&k->k, ptrs, u, i)
+ if (!crc_is_compressed(u) &&
+ u.csum_type &&
+ u.live_size == u.uncompressed_size) {
+ n = u;
+ goto found;
+ }
+ return false;
+ }
+found:
+ BUG_ON(crc_is_compressed(n));
+ BUG_ON(n.offset);
+ BUG_ON(n.live_size != k->k.size);
+
+restart_narrow_pointers:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
+ if (can_narrow_crc(p.crc, n)) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
+ p.ptr.offset += p.crc.offset;
+ p.crc = n;
+ bch2_extent_ptr_decoded_append(k, &p);
+ ret = true;
+ goto restart_narrow_pointers;
+ }
+
+ return ret;
+}
+
+static void bch2_extent_crc_pack(union bch_extent_crc *dst,
+ struct bch_extent_crc_unpacked src,
+ enum bch_extent_entry_type type)
+{
+#define set_common_fields(_dst, _src) \
+ _dst.type = 1 << type; \
+ _dst.csum_type = _src.csum_type, \
+ _dst.compression_type = _src.compression_type, \
+ _dst._compressed_size = _src.compressed_size - 1, \
+ _dst._uncompressed_size = _src.uncompressed_size - 1, \
+ _dst.offset = _src.offset
+
+ switch (type) {
+ case BCH_EXTENT_ENTRY_crc32:
+ set_common_fields(dst->crc32, src);
+ dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ set_common_fields(dst->crc64, src);
+ dst->crc64.nonce = src.nonce;
+ dst->crc64.csum_lo = (u64 __force) src.csum.lo;
+ dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ set_common_fields(dst->crc128, src);
+ dst->crc128.nonce = src.nonce;
+ dst->crc128.csum = src.csum;
+ break;
+ default:
+ BUG();
+ }
+#undef set_common_fields
+}
+
+void bch2_extent_crc_append(struct bkey_i *k,
+ struct bch_extent_crc_unpacked new)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ union bch_extent_crc *crc = (void *) ptrs.end;
+ enum bch_extent_entry_type type;
+
+ if (bch_crc_bytes[new.csum_type] <= 4 &&
+ new.uncompressed_size <= CRC32_SIZE_MAX &&
+ new.nonce <= CRC32_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc32;
+ else if (bch_crc_bytes[new.csum_type] <= 10 &&
+ new.uncompressed_size <= CRC64_SIZE_MAX &&
+ new.nonce <= CRC64_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc64;
+ else if (bch_crc_bytes[new.csum_type] <= 16 &&
+ new.uncompressed_size <= CRC128_SIZE_MAX &&
+ new.nonce <= CRC128_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc128;
+ else
+ BUG();
+
+ bch2_extent_crc_pack(crc, new, type);
+
+ k->k.u64s += extent_entry_u64s(ptrs.end);
+
+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
+}
+
+/* Generic code for keys with pointers: */
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
+{
+ return bch2_bkey_devs(k).nr;
+}
+
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+ return k.k->type == KEY_TYPE_reservation
+ ? bkey_s_c_to_reservation(k).v->nr_replicas
+ : bch2_bkey_dirty_devs(k).nr;
+}
+
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
+{
+ unsigned ret = 0;
+
+ if (k.k->type == KEY_TYPE_reservation) {
+ ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+ } else {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ ret += !p.ptr.cached && !crc_is_compressed(p.crc);
+ }
+
+ return ret;
+}
+
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned ret = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached && crc_is_compressed(p.crc))
+ ret += p.crc.compressed_size;
+
+ return ret;
+}
+
+bool bch2_bkey_is_incompressible(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
+
+ bkey_for_each_crc(k.k, ptrs, crc, entry)
+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ return true;
+ return false;
+}
+
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p = { 0 };
+ unsigned replicas = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
+ continue;
+
+ if (p.has_ec)
+ replicas += p.ec.redundancy;
+
+ replicas++;
+
+ }
+
+ return replicas;
+}
+
+static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
+{
+ if (p->ptr.cached)
+ return 0;
+
+ return p->has_ec
+ ? p->ec.redundancy + 1
+ : ca->mi.durability;
+}
+
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+
+ return __extent_ptr_durability(ca, p);
+}
+
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+
+ if (ca->mi.state == BCH_MEMBER_STATE_failed)
+ return 0;
+
+ return __extent_ptr_durability(ca, p);
+}
+
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned durability = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ durability += bch2_extent_ptr_durability(c, &p);
+
+ return durability;
+}
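+/*
+ * Example (hypothetical key): one non-cached pointer on a durability-1
+ * device plus one erasure coded pointer with redundancy 1 gives
+ * 1 + (1 + 1) = 3; cached pointers contribute nothing.
+ */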
+
+static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned durability = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
+ durability += bch2_extent_ptr_durability(c, &p);
+
+ return durability;
+}
+
+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
+ k->k.u64s -= extent_entry_u64s(entry);
+}
+
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
+ struct extent_ptr_decoded *p)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(&k->k, NULL);
+ union bch_extent_entry *pos;
+
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+ pos = ptrs.start;
+ goto found;
+ }
+
+ bkey_for_each_crc(&k->k, ptrs, crc, pos)
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+ pos = extent_entry_next(pos);
+ goto found;
+ }
+
+ bch2_extent_crc_append(k, p->crc);
+ pos = bkey_val_end(bkey_i_to_s(k));
+found:
+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ __extent_entry_insert(k, pos, to_entry(&p->ptr));
+
+ if (p->has_ec) {
+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+ __extent_entry_insert(k, pos, to_entry(&p->ec));
+ }
+}
+
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
+ union bch_extent_entry *entry)
+{
+ union bch_extent_entry *i = ptrs.start;
+
+ if (i == entry)
+ return NULL;
+
+ while (extent_entry_next(i) != entry)
+ i = extent_entry_next(i);
+ return i;
+}
+
+/*
+ * Returns pointer to the next entry after the one being dropped:
+ */
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry = to_entry(ptr), *next;
+ union bch_extent_entry *ret = entry;
+ bool drop_crc = true;
+
+ EBUG_ON(ptr < &ptrs.start->ptr ||
+ ptr >= &ptrs.end->ptr);
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+
+ for (next = extent_entry_next(entry);
+ next != ptrs.end;
+ next = extent_entry_next(next)) {
+ if (extent_entry_is_crc(next)) {
+ break;
+ } else if (extent_entry_is_ptr(next)) {
+ drop_crc = false;
+ break;
+ }
+ }
+
+ extent_entry_drop(k, entry);
+
+ while ((entry = extent_entry_prev(ptrs, entry))) {
+ if (extent_entry_is_ptr(entry))
+ break;
+
+ if ((extent_entry_is_crc(entry) && drop_crc) ||
+ extent_entry_is_stripe_ptr(entry)) {
+ ret = (void *) ret - extent_entry_bytes(entry);
+ extent_entry_drop(k, entry);
+ }
+ }
+
+ return ret;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
+{
+ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
+ union bch_extent_entry *ret =
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+
+ /*
+ * If we deleted all the dirty pointers and there's still cached
+ * pointers, we could set the cached pointers to dirty if they're not
+ * stale - but to do that correctly we'd need to grab an open_bucket
+ * reference so that we don't race with bucket reuse:
+ */
+ if (have_dirty &&
+ !bch2_bkey_dirty_devs(k.s_c).nr) {
+ k.k->type = KEY_TYPE_error;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
+ k.k->type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ }
+
+ return ret;
+}
+
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
+}
+
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
+
+ if (ptr)
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+}
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == dev)
+ return ptr;
+
+ return NULL;
+}
+
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (!ptr->cached ||
+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+ return true;
+
+ return false;
+}
+
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_extent_ptr m, u64 offset)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == m.dev &&
+ p.ptr.gen == m.gen &&
+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
+ (s64) m.offset - offset)
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+ if (k1.k->type != k2.k->type)
+ return false;
+
+ if (bkey_extent_is_direct_data(k1.k)) {
+ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry1, *entry2;
+ struct extent_ptr_decoded p1, p2;
+
+ if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
+ return false;
+
+ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
+
+ return false;
+ } else {
+ /* KEY_TYPE_deleted, etc. */
+ return true;
+ }
+}
+
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
+{
+ struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+ union bch_extent_entry *entry2;
+ struct extent_ptr_decoded p2;
+
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return &entry2->ptr;
+
+ return NULL;
+}
+
+void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ union bch_extent_entry *ec = NULL;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (&entry->ptr == ptr) {
+ ptr->cached = true;
+ if (ec)
+ extent_entry_drop(k, ec);
+ return;
+ }
+
+ if (extent_entry_is_stripe_ptr(entry))
+ ec = entry;
+ else if (extent_entry_is_ptr(entry))
+ ec = NULL;
+ }
+
+ BUG();
+}
+
+/*
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
+ *
+ * Returns true if @k should be dropped entirely
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
+{
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(k, ptr,
+ ptr->cached &&
+ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
+
+ return bkey_deleted(k.k);
+}
+
+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ bool first = true;
+
+ if (c)
+ prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (!first)
+ prt_printf(out, " ");
+
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr: {
+ const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
+
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ if (ca && ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
+ break;
+ }
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ crc.compressed_size,
+ crc.uncompressed_size,
+ crc.offset, crc.nonce,
+ bch2_csum_types[crc.csum_type],
+ bch2_compression_types[crc.compression_type]);
+ break;
+ }
+ case BCH_EXTENT_ENTRY_stripe_ptr: {
+ const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
+
+ prt_printf(out, "ec: idx %llu block %u",
+ (u64) ec->idx, ec->block);
+ break;
+ }
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ prt_str(out, "rebalance: target ");
+ if (c)
+ bch2_target_to_text(out, c, r->target);
+ else
+ prt_printf(out, "%u", r->target);
+ prt_str(out, " compression ");
+ bch2_compression_opt_to_text(out, r->compression);
+ break;
+ }
+ default:
+ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+ return;
+ }
+
+ first = false;
+ }
+}
+
+static int extent_ptr_invalid(struct bch_fs *c,
+ struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata,
+ struct printbuf *err)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr2;
+ u64 bucket;
+ u32 bucket_offset;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ if (!bch2_dev_exists2(c, ptr->dev)) {
+ /*
+ * If we're in the write path this key might have already been
+ * overwritten, and we could be seeing a device that doesn't
+ * exist anymore due to racing with device removal:
+ */
+ if (flags & BKEY_INVALID_WRITE)
+ return 0;
+
+ bkey_fsck_err(c, err, ptr_to_invalid_device,
+ "pointer to invalid device (%u)", ptr->dev);
+ }
+
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+ bkey_for_each_ptr(ptrs, ptr2)
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
+ ptr_to_duplicate_device,
+ "multiple pointers to same device (%u)", ptr->dev);
+
+ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+
+ bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+ ptr_after_last_bucket,
+ "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
+ bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+ ptr_before_first_bucket,
+ "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+ ptr_spans_multiple_buckets,
+ "pointer spans multiple buckets (%u + %u > %u)",
+ bucket_offset, size_ondisk, ca->mi.bucket_size);
+fsck_err:
+ return ret;
+}
+
+int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
+ unsigned size_ondisk = k.k->size;
+ unsigned nonce = UINT_MAX;
+ unsigned nr_ptrs = 0;
+ bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(k.k))
+ size_ondisk = btree_sectors(c);
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
+ extent_ptrs_invalid_entry,
+ "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+
+ bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
+ !extent_entry_is_ptr(entry), c, err,
+ btree_ptr_has_non_ptr,
+ "has non ptr field");
+
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
+ size_ondisk, false, err);
+ if (ret)
+ return ret;
+
+ bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
+ ptr_cached_and_erasure_coded,
+ "cached, erasure coded ptr");
+
+ if (!entry->ptr.unwritten)
+ have_written = true;
+ else
+ have_unwritten = true;
+
+ have_ec = false;
+ crc_since_last_ptr = false;
+ nr_ptrs++;
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128:
+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
+ ptr_crc_uncompressed_size_too_small,
+ "checksum offset + key size > uncompressed size");
+ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
+ ptr_crc_csum_type_unknown,
+ "invalid checksum type");
+ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
+ ptr_crc_compression_type_unknown,
+ "invalid compression type");
+
+ if (bch2_csum_type_is_encryption(crc.csum_type)) {
+ if (nonce == UINT_MAX)
+ nonce = crc.offset + crc.nonce;
+ else if (nonce != crc.offset + crc.nonce)
+ bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+ "incorrect nonce");
+ }
+
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ ptr_crc_redundant,
+ "redundant crc entry");
+ crc_since_last_ptr = true;
+
+ bkey_fsck_err_on(crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+ ptr_crc_uncompressed_size_too_big,
+ "too large encoded extent");
+
+ size_ondisk = crc.compressed_size;
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ bkey_fsck_err_on(have_ec, c, err,
+ ptr_stripe_redundant,
+ "redundant stripe entry");
+ have_ec = true;
+ break;
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ if (!bch2_compression_opt_valid(r->compression)) {
+ struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+ prt_printf(err, "invalid compression opt %u:%u",
+ opt.type, opt.level);
+ return -BCH_ERR_invalid_bkey;
+ }
+ break;
+ }
+ }
+ }
+
+ bkey_fsck_err_on(!nr_ptrs, c, err,
+ extent_ptrs_no_ptrs,
+ "no ptrs");
+ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
+ extent_ptrs_too_many_ptrs,
+ "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
+ bkey_fsck_err_on(have_written && have_unwritten, c, err,
+ extent_ptrs_written_and_unwritten,
+ "extent with unwritten and written ptrs");
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
+ extent_ptrs_unwritten,
+ "has unwritten ptrs");
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ extent_ptrs_redundant_crc,
+ "redundant crc entry");
+ bkey_fsck_err_on(have_ec, c, err,
+ extent_ptrs_redundant_stripe,
+ "redundant stripe entry");
+fsck_err:
+ return ret;
+}
+
+void bch2_ptr_swab(struct bkey_s k)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ u64 *d;
+
+ for (d = (u64 *) ptrs.start;
+ d != (u64 *) ptrs.end;
+ d++)
+ *d = swab64(*d);
+
+ for (entry = ptrs.start;
+ entry < ptrs.end;
+ entry = extent_entry_next(entry)) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.csum = swab32(entry->crc32.csum);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.csum.hi = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.hi);
+ entry->crc128.csum.lo = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_rebalance:
+ break;
+ }
+ }
+}
+
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned rewrite_ptrs = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten) {
+ rewrite_ptrs = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+
+ return rewrite_ptrs;
+}
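+/*
+ * The return value is a bitmask indexed by pointer position: e.g. a result
+ * of 0x1 means only the first pointer is on the wrong target or compressed
+ * with the wrong type, and should be rewritten.
+ */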
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ /*
+ * If it's an indirect extent, we don't delete the rebalance entry when
+ * done so that we know what options were applied - check if it still
+ * needs work done:
+ */
+ if (r &&
+ k.k->type == KEY_TYPE_reflink_v &&
+ !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+ r = NULL;
+
+ return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *r;
+ bool needs_rebalance;
+
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ /* get existing rebalance entry: */
+ r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+ if (r) {
+ if (k.k->type == KEY_TYPE_reflink_v) {
+ /*
+ * indirect extents: existing options take precedence,
+ * so that we don't move extents back and forth if
+ * they're referenced by different inodes with different
+ * options:
+ */
+ if (r->target)
+ target = r->target;
+ if (r->compression)
+ compression = r->compression;
+ }
+
+ r->target = target;
+ r->compression = compression;
+ }
+
+ needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+ if (needs_rebalance && !r) {
+ union bch_extent_entry *new = bkey_val_end(k);
+
+ new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
+ new->rebalance.compression = compression;
+ new->rebalance.target = target;
+ new->rebalance.unused = 0;
+ k.k->u64s += extent_entry_u64s(new);
+ } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+		/*
+		 * For indirect extents, we don't delete the rebalance entry
+		 * when we're finished, so that we know we specifically moved
+		 * or compressed the data to its current location/compression
+		 * type.
+		 */
+ extent_entry_drop(k, (union bch_extent_entry *) r);
+ }
+
+ return 0;
+}
+
+/* Generic extent code: */
+
+int bch2_cut_front_s(struct bpos where, struct bkey_s k)
+{
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
+ int val_u64s_delta;
+ u64 sub;
+
+ if (bkey_le(where, bkey_start_pos(k.k)))
+ return 0;
+
+ EBUG_ON(bkey_gt(where, k.k->p));
+
+ sub = where.offset - bkey_start_offset(k.k);
+
+ k.k->size -= sub;
+
+ if (!k.k->size) {
+ k.k->type = KEY_TYPE_deleted;
+ new_val_u64s = 0;
+ }
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v: {
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ bool seen_crc = false;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ if (!seen_crc)
+ entry->ptr.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_rebalance:
+ break;
+ }
+
+ if (extent_entry_is_crc(entry))
+ seen_crc = true;
+ }
+
+ break;
+ }
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
+
+ le64_add_cpu(&p.v->idx, sub);
+ break;
+ }
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data: {
+ void *p = bkey_inline_data_p(k);
+ unsigned bytes = bkey_inline_data_bytes(k.k);
+
+ sub = min_t(u64, sub << 9, bytes);
+
+ memmove(p, p + sub, bytes - sub);
+
+ new_val_u64s -= sub >> 3;
+ break;
+ }
+ }
+
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+ BUG_ON(val_u64s_delta < 0);
+
+ set_bkey_val_u64s(k.k, new_val_u64s);
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+ return -val_u64s_delta;
+}
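+/*
+ * Example: cutting 8 sectors off the front of a 32-sector extent leaves
+ * k->size == 24; raw pointers have ptr.offset advanced by 8, while pointers
+ * covered by a checksum entry get crc.offset bumped by 8 instead, so the
+ * checksum still describes the data as originally written.  The return
+ * value is minus the number of u64s the value shrank by.
+ */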
+
+int bch2_cut_back_s(struct bpos where, struct bkey_s k)
+{
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
+ int val_u64s_delta;
+ u64 len = 0;
+
+ if (bkey_ge(where, k.k->p))
+ return 0;
+
+ EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
+
+ len = where.offset - bkey_start_offset(k.k);
+
+ k.k->p.offset = where.offset;
+ k.k->size = len;
+
+ if (!len) {
+ k.k->type = KEY_TYPE_deleted;
+ new_val_u64s = 0;
+ }
+
+ switch (k.k->type) {
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data:
+ new_val_u64s = (bkey_inline_data_offset(k.k) +
+ min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
+ break;
+ }
+
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+ BUG_ON(val_u64s_delta < 0);
+
+ set_bkey_val_u64s(k.k, new_val_u64s);
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+ return -val_u64s_delta;
+}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
new file mode 100644
index 000000000000..a2ce8a3be13c
--- /dev/null
+++ b/fs/bcachefs/extents.h
@@ -0,0 +1,765 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_H
+#define _BCACHEFS_EXTENTS_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "extents_types.h"
+
+struct bch_fs;
+struct btree_trans;
+enum bkey_invalid_flags;
+
+/* extent entries: */
+
+#define extent_entry_last(_e) \
+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
+
+#define entry_to_ptr(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
+ \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const struct bch_extent_ptr *) (_entry), \
+ (struct bch_extent_ptr *) (_entry)); \
+})
+
+/* downcast, preserves const */
+#define to_entry(_entry) \
+({ \
+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
+ !type_is(_entry, struct bch_extent_ptr *) && \
+ !type_is(_entry, struct bch_extent_stripe_ptr *)); \
+ \
+ __builtin_choose_expr( \
+ (type_is_exact(_entry, const union bch_extent_crc *) || \
+ type_is_exact(_entry, const struct bch_extent_ptr *) ||\
+ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
+ (const union bch_extent_entry *) (_entry), \
+ (union bch_extent_entry *) (_entry)); \
+})
+
+#define extent_entry_next(_entry) \
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+static inline unsigned
+__extent_entry_type(const union bch_extent_entry *e)
+{
+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
+}
+
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+ int ret = __ffs(e->type);
+
+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
+
+ return ret;
+}
+
+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
+{
+ switch (extent_entry_type(entry)) {
+#define x(f, n) \
+ case BCH_EXTENT_ENTRY_##f: \
+ return sizeof(struct bch_extent_##f);
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
+{
+ return extent_entry_bytes(entry) / sizeof(u64);
+}
+
+static inline void __extent_entry_insert(struct bkey_i *k,
+ union bch_extent_entry *dst,
+ union bch_extent_entry *new)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+
+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
+ dst, (u64 *) end - (u64 *) dst);
+ k->k.u64s += extent_entry_u64s(new);
+ memcpy_u64s_small(dst, new, extent_entry_u64s(new));
+}
+
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+}
+
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+ switch (extent_entry_type(e)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128:
+ return true;
+ default:
+ return false;
+ }
+}
+
+union bch_extent_crc {
+ u8 type;
+ struct bch_extent_crc32 crc32;
+ struct bch_extent_crc64 crc64;
+ struct bch_extent_crc128 crc128;
+};
+
+#define __entry_to_crc(_entry) \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const union bch_extent_crc *) (_entry), \
+ (union bch_extent_crc *) (_entry))
+
+#define entry_to_crc(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
+ \
+ __entry_to_crc(_entry); \
+})
+
+static inline struct bch_extent_crc_unpacked
+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
+{
+#define common_fields(_crc) \
+ .csum_type = _crc.csum_type, \
+ .compression_type = _crc.compression_type, \
+ .compressed_size = _crc._compressed_size + 1, \
+ .uncompressed_size = _crc._uncompressed_size + 1, \
+ .offset = _crc.offset, \
+ .live_size = k->size
+
+ if (!crc)
+ return (struct bch_extent_crc_unpacked) {
+ .compressed_size = k->size,
+ .uncompressed_size = k->size,
+ .live_size = k->size,
+ };
+
+ switch (extent_entry_type(to_entry(crc))) {
+ case BCH_EXTENT_ENTRY_crc32: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc32),
+ };
+
+ *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
+ return ret;
+ }
+ case BCH_EXTENT_ENTRY_crc64: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc64),
+ .nonce = crc->crc64.nonce,
+ .csum.lo = (__force __le64) crc->crc64.csum_lo,
+ };
+
+ *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
+
+ return ret;
+ }
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc128),
+ .nonce = crc->crc128.nonce,
+ .csum = crc->crc128.csum,
+ };
+
+ return ret;
+ }
+ default:
+ BUG();
+ }
+#undef common_fields
+}
+
+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
+{
+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
+}
+
+static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
+{
+ return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
+}
+
+/* bkey_ptrs: generically over any key type that has ptrs */
+
+struct bkey_ptrs_c {
+ const union bch_extent_entry *start;
+ const union bch_extent_entry *end;
+};
+
+struct bkey_ptrs {
+ union bch_extent_entry *start;
+ union bch_extent_entry *end;
+};
+
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr: {
+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
+
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+ return (struct bkey_ptrs_c) {
+ e.v->start,
+ extent_entry_last(e)
+ };
+ }
+ case KEY_TYPE_stripe: {
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ return (struct bkey_ptrs_c) {
+ to_entry(&s.v->ptrs[0]),
+ to_entry(&s.v->ptrs[s.v->nr_blocks]),
+ };
+ }
+ case KEY_TYPE_reflink_v: {
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ return (struct bkey_ptrs_c) {
+ r.v->start,
+ bkey_val_end(r),
+ };
+ }
+ case KEY_TYPE_btree_ptr_v2: {
+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
+
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
+ default:
+ return (struct bkey_ptrs_c) { NULL, NULL };
+ }
+}
+
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
+{
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
+
+ return (struct bkey_ptrs) {
+ (void *) p.start,
+ (void *) p.end
+ };
+}
+
+#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
+ for ((_entry) = (_start); \
+ (_entry) < (_end); \
+ (_entry) = extent_entry_next(_entry))
+
+#define __bkey_ptr_next(_ptr, _end) \
+({ \
+ typeof(_end) _entry; \
+ \
+ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \
+ if (extent_entry_is_ptr(_entry)) \
+ break; \
+ \
+ _entry < (_end) ? entry_to_ptr(_entry) : NULL; \
+})
+
+#define bkey_extent_entry_for_each_from(_p, _entry, _start) \
+ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
+
+#define bkey_extent_entry_for_each(_p, _entry) \
+ bkey_extent_entry_for_each_from(_p, _entry, _p.start)
+
+#define __bkey_for_each_ptr(_start, _end, _ptr) \
+ for ((_ptr) = (_start); \
+ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \
+ (_ptr)++)
+
+#define bkey_ptr_next(_p, _ptr) \
+ __bkey_ptr_next(_ptr, (_p).end)
+
+#define bkey_for_each_ptr(_p, _ptr) \
+ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
+
+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \
+({ \
+ __label__ out; \
+ \
+ (_ptr).idx = 0; \
+ (_ptr).has_ec = false; \
+ \
+ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \
+ switch (extent_entry_type(_entry)) { \
+ case BCH_EXTENT_ENTRY_ptr: \
+ (_ptr).ptr = _entry->ptr; \
+ goto out; \
+ case BCH_EXTENT_ENTRY_crc32: \
+ case BCH_EXTENT_ENTRY_crc64: \
+ case BCH_EXTENT_ENTRY_crc128: \
+ (_ptr).crc = bch2_extent_crc_unpack(_k, \
+ entry_to_crc(_entry)); \
+ break; \
+ case BCH_EXTENT_ENTRY_stripe_ptr: \
+ (_ptr).ec = _entry->stripe_ptr; \
+ (_ptr).has_ec = true; \
+ break; \
+ default: \
+ /* nothing */ \
+ break; \
+ } \
+out: \
+ _entry < (_end); \
+})
+
+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \
+ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
+ (_entry) = _start; \
+ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
+ (_entry) = extent_entry_next(_entry))
+
+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
+ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
+ _ptr, _entry)
+
+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
+({ \
+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
+ if (extent_entry_is_crc(_iter)) { \
+ (_crc) = bch2_extent_crc_unpack(_k, \
+ entry_to_crc(_iter)); \
+ break; \
+ } \
+ \
+ (_iter) < (_end); \
+})
+
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
+ (_iter) = (_start); \
+ bkey_crc_next(_k, _start, _end, _crc, _iter); \
+ (_iter) = extent_entry_next(_iter))
+
+#define bkey_for_each_crc(_k, _p, _crc, _iter) \
+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
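+/*
+ * Typical iteration over decoded pointers (the same pattern the helpers in
+ * extents.c use; use_pointer() is a placeholder):
+ *
+ *	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ *	const union bch_extent_entry *entry;
+ *	struct extent_ptr_decoded p;
+ *
+ *	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ *		if (!p.ptr.cached)
+ *			use_pointer(&p);
+ */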
+
+/* Iterate over pointers in KEY_TYPE_extent: */
+
+#define extent_for_each_entry_from(_e, _entry, _start) \
+ __bkey_extent_entry_for_each_from(_start, \
+ extent_entry_last(_e), _entry)
+
+#define extent_for_each_entry(_e, _entry) \
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+#define extent_ptr_next(_e, _ptr) \
+ __bkey_ptr_next(_ptr, extent_entry_last(_e))
+
+#define extent_for_each_ptr(_e, _ptr) \
+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
+
+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
+ extent_entry_last(_e), _ptr, _entry)
+
+/* utility code common to all keys with pointers: */
+
+void bch2_mark_io_failure(struct bch_io_failures *,
+ struct extent_ptr_decoded *);
+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
+ struct bch_io_failures *,
+ struct extent_ptr_decoded *);
+
+/* KEY_TYPE_btree_ptr: */
+
+int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
+ int, struct bkey_s);
+
+#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
+ .key_invalid = bch2_btree_ptr_invalid, \
+ .val_to_text = bch2_btree_ptr_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+})
+
+#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
+ .key_invalid = bch2_btree_ptr_v2_invalid, \
+ .val_to_text = bch2_btree_ptr_v2_to_text, \
+ .swab = bch2_ptr_swab, \
+ .compat = bch2_btree_ptr_v2_compat, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+ .min_val_size = 40, \
+})
+
+/* KEY_TYPE_extent: */
+
+bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_extent ((struct bkey_ops) { \
+ .key_invalid = bch2_bkey_ptrs_invalid, \
+ .val_to_text = bch2_bkey_ptrs_to_text, \
+ .swab = bch2_ptr_swab, \
+ .key_normalize = bch2_extent_normalize, \
+ .key_merge = bch2_extent_merge, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+})
+
+/* KEY_TYPE_reservation: */
+
+int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
+ .key_invalid = bch2_reservation_invalid, \
+ .val_to_text = bch2_reservation_to_text, \
+ .key_merge = bch2_reservation_merge, \
+ .trans_trigger = bch2_trans_mark_reservation, \
+ .atomic_trigger = bch2_mark_reservation, \
+ .min_val_size = 8, \
+})
+
+/* Extent checksum entries: */
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
+ struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+void bch2_extent_crc_append(struct bkey_i *,
+ struct bch_extent_crc_unpacked);
+
+/* Generic code for keys with pointers: */
+
+static inline bool bkey_is_btree_ptr(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_direct_data(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_inline_data(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inline_data ||
+ k->type == KEY_TYPE_indirect_inline_data;
+}
+
+static inline unsigned bkey_inline_data_offset(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_inline_data:
+ return sizeof(struct bch_inline_data);
+ case KEY_TYPE_indirect_inline_data:
+ return sizeof(struct bch_indirect_inline_data);
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
+{
+ return bkey_val_bytes(k) - bkey_inline_data_offset(k);
+}
+
+#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
+
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+ return bkey_extent_is_direct_data(k) ||
+ bkey_extent_is_inline_data(k) ||
+ k->type == KEY_TYPE_reflink_p;
+}
+
+/*
+ * Should extent be counted under inode->i_sectors?
+ */
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reservation:
+ case KEY_TYPE_reflink_p:
+ case KEY_TYPE_reflink_v:
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data:
+ case KEY_TYPE_error:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->unwritten)
+ return true;
+ return false;
+}
+
+static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
+{
+ return k.k->type == KEY_TYPE_reservation ||
+ bkey_extent_is_unwritten(k);
+}
+
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(p, ptr)
+ ret.devs[ret.nr++] = ptr->dev;
+
+ return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(p, ptr)
+ if (!ptr->cached)
+ ret.devs[ret.nr++] = ptr->dev;
+
+ return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(p, ptr)
+ if (ptr->cached)
+ ret.devs[ret.nr++] = ptr->dev;
+
+ return ret;
+}
+
+static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return BCH_DATA_btree;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return BCH_DATA_user;
+ case KEY_TYPE_stripe: {
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ BUG_ON(ptr < s.v->ptrs ||
+ ptr >= s.v->ptrs + s.v->nr_blocks);
+
+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity
+ : BCH_DATA_user;
+ }
+ default:
+ BUG();
+ }
+}
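
The stripe case above classifies a pointer purely by position: the last nr_redundant blocks of a stripe hold parity, everything before them holds user data. A standalone sketch of that arithmetic (userspace, illustrative values only, not part of this patch):

#include <stdio.h>

int main(void)
{
	unsigned nr_blocks = 6, nr_redundant = 2, i;

	for (i = 0; i < nr_blocks; i++)
		printf("block %u: %s\n", i,
		       i >= nr_blocks - nr_redundant ? "parity" : "user");
	return 0;
}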
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+bool bch2_bkey_is_incompressible(struct bkey_s_c);
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
+
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
+unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
+
+static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
+{
+ return (void *) bch2_bkey_has_device_c(k.s_c, dev);
+}
+
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
+
+static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
+{
+ struct bch_extent_ptr *dest;
+
+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
+
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ case KEY_TYPE_extent:
+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+
+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
+ *dest = ptr;
+ k->k.u64s++;
+ break;
+ default:
+ BUG();
+ }
+}
+
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
+ struct extent_ptr_decoded *);
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
+ struct bch_extent_ptr *);
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
+ struct bch_extent_ptr *);
+
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
+do { \
+ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
+ \
+ _ptr = &_ptrs.start->ptr; \
+ \
+ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
+ if (_cond) { \
+ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \
+ _ptrs = bch2_bkey_ptrs(_k); \
+ continue; \
+ } \
+ \
+ (_ptr)++; \
+ } \
+} while (0)
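
A usage sketch for the macro above (illustrative only; assumes a struct bkey_i *k in scope): drop every cached pointer from a key, similar in spirit to how stale cached pointers get dropped during normalization.

	struct bch_extent_ptr *ptr;

	bch2_bkey_drop_ptrs(bkey_i_to_s(k), ptr, ptr->cached);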
+
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+ struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
+
+void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
+
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+
+void bch2_ptr_swab(struct bkey_s);
+
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
+ unsigned, unsigned);
+bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
+ unsigned, unsigned);
+
+/* Generic extent code: */
+
+enum bch_extent_overlap {
+ BCH_EXTENT_OVERLAP_ALL = 0,
+ BCH_EXTENT_OVERLAP_BACK = 1,
+ BCH_EXTENT_OVERLAP_FRONT = 2,
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+ const struct bkey *m)
+{
+ int cmp1 = bkey_lt(k->p, m->p);
+ int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
+
+ return (cmp1 << 1) + cmp2;
+}
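
A standalone sketch of this encoding (userspace, not part of this patch; positions simplified to one-dimensional half-open ranges instead of struct bpos): the two comparisons map directly onto the four enum values above.

#include <stdio.h>

enum overlap { ALL, BACK, FRONT, MIDDLE };

/* k and m are half-open [start, end) extents on a single axis */
static enum overlap classify(unsigned k_start, unsigned k_end,
			     unsigned m_start, unsigned m_end)
{
	int cmp1 = k_end < m_end;	/* k ends before m ends    */
	int cmp2 = k_start > m_start;	/* k starts after m starts */

	return (cmp1 << 1) + cmp2;
}

int main(void)
{
	printf("%d\n", classify(0, 10, 2, 8));	/* 0: k covers all of m    */
	printf("%d\n", classify(5, 10, 0, 8));	/* 1: k overlaps the back  */
	printf("%d\n", classify(0, 5, 2, 10));	/* 2: k overlaps the front */
	printf("%d\n", classify(3, 5, 0, 10));	/* 3: k sits in the middle */
	return 0;
}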
+
+int bch2_cut_front_s(struct bpos, struct bkey_s);
+int bch2_cut_back_s(struct bpos, struct bkey_s);
+
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
+{
+ bch2_cut_front_s(where, bkey_i_to_s(k));
+}
+
+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
+{
+ bch2_cut_back_s(where, bkey_i_to_s(k));
+}
+
+/**
+ * bch2_key_resize - adjust size of @k
+ *
+ * bkey_start_offset(k) is preserved; only the end of the extent moves.
+ */
+static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
+{
+ k->p.offset -= k->size;
+ k->p.offset += new_size;
+ k->size = new_size;
+}
+
+#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
new file mode 100644
index 000000000000..43d6c341ecca
--- /dev/null
+++ b/fs/bcachefs/extents_types.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_TYPES_H
+#define _BCACHEFS_EXTENTS_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_extent_crc_unpacked {
+ u32 compressed_size;
+ u32 uncompressed_size;
+ u32 live_size;
+
+ u8 csum_type;
+ u8 compression_type;
+
+ u16 offset;
+
+ u16 nonce;
+
+ struct bch_csum csum;
+};
+
+struct extent_ptr_decoded {
+ unsigned idx;
+ bool has_ec;
+ struct bch_extent_crc_unpacked crc;
+ struct bch_extent_ptr ptr;
+ struct bch_extent_stripe_ptr ec;
+};
+
+struct bch_io_failures {
+ u8 nr;
+ struct bch_dev_io_failures {
+ u8 dev;
+ u8 idx;
+ u8 nr_failed;
+ u8 nr_retries;
+ } devs[BCH_REPLICAS_MAX];
+};
+
+#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
new file mode 100644
index 000000000000..05429c9631cd
--- /dev/null
+++ b/fs/bcachefs/eytzinger.h
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _EYTZINGER_H
+#define _EYTZINGER_H
+
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "util.h"
+
+/*
+ * Traversal for trees in eytzinger layout - a full binary tree laid out in an
+ * array
+ */
+
+/*
+ * One-based indexing version:
+ *
+ * With one-based indexing, each level of the tree starts at a power of two -
+ * good for cacheline alignment:
+ */
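
To make the layout concrete, here is a standalone userspace sketch (not part of the patch) that builds a 1-based eytzinger array from a sorted array by filling the implicit tree in-order - the median lands at index 1 and each tree level starts at a power-of-two index:

#include <stdio.h>

static void fill(const int *sorted, int *eytz, unsigned i, unsigned n,
		 unsigned *pos)
{
	if (i > n)
		return;
	fill(sorted, eytz, 2 * i, n, pos);	/* left subtree */
	eytz[i] = sorted[(*pos)++];		/* this node */
	fill(sorted, eytz, 2 * i + 1, n, pos);	/* right subtree */
}

int main(void)
{
	int sorted[] = { 10, 20, 30, 40, 50, 60, 70 };
	int eytz[8];
	unsigned i, pos = 0;

	fill(sorted, eytz, 1, 7, &pos);

	for (i = 1; i <= 7; i++)
		printf("%d ", eytz[i]);		/* 40 20 60 10 30 50 70 */
	printf("\n");
	return 0;
}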
+
+static inline unsigned eytzinger1_child(unsigned i, unsigned child)
+{
+ EBUG_ON(child > 1);
+
+ return (i << 1) + child;
+}
+
+static inline unsigned eytzinger1_left_child(unsigned i)
+{
+ return eytzinger1_child(i, 0);
+}
+
+static inline unsigned eytzinger1_right_child(unsigned i)
+{
+ return eytzinger1_child(i, 1);
+}
+
+static inline unsigned eytzinger1_first(unsigned size)
+{
+ return rounddown_pow_of_two(size);
+}
+
+static inline unsigned eytzinger1_last(unsigned size)
+{
+ return rounddown_pow_of_two(size + 1) - 1;
+}
+
+/*
+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that
+ *
+ * eytzinger1_next(0) == eytzinger1_first()
+ * eytzinger1_prev(0) == eytzinger1_last()
+ *
+ * eytzinger1_prev(eytzinger1_first()) == 0
+ * eytzinger1_next(eytzinger1_last()) == 0
+ */
+
+static inline unsigned eytzinger1_next(unsigned i, unsigned size)
+{
+ EBUG_ON(i > size);
+
+ if (eytzinger1_right_child(i) <= size) {
+ i = eytzinger1_right_child(i);
+
+ i <<= __fls(size + 1) - __fls(i);
+ i >>= i > size;
+ } else {
+ i >>= ffz(i) + 1;
+ }
+
+ return i;
+}
+
+static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
+{
+ EBUG_ON(i > size);
+
+ if (eytzinger1_left_child(i) <= size) {
+ i = eytzinger1_left_child(i) + 1;
+
+ i <<= __fls(size + 1) - __fls(i);
+ i -= 1;
+ i >>= i > size;
+ } else {
+ i >>= __ffs(i) + 1;
+ }
+
+ return i;
+}
+
+static inline unsigned eytzinger1_extra(unsigned size)
+{
+ return (size + 1 - rounddown_pow_of_two(size)) << 1;
+}
+
+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
+ unsigned extra)
+{
+ unsigned b = __fls(i);
+ unsigned shift = __fls(size) - b;
+ int s;
+
+ EBUG_ON(!i || i > size);
+
+ i ^= 1U << b;
+ i <<= 1;
+ i |= 1;
+ i <<= shift;
+
+ /*
+ * sign bit trick:
+ *
+ * if (i > extra)
+ * i -= (i - extra) >> 1;
+ */
+ s = extra - i;
+ i += (s >> 1) & (s >> 31);
+
+ return i;
+}
+
+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
+ unsigned extra)
+{
+ unsigned shift;
+ int s;
+
+ EBUG_ON(!i || i > size);
+
+ /*
+ * sign bit trick:
+ *
+ * if (i > extra)
+ * i += i - extra;
+ */
+ s = extra - i;
+ i -= s & (s >> 31);
+
+ shift = __ffs(i);
+
+ i >>= shift + 1;
+ i |= 1U << (__fls(size) - shift);
+
+ return i;
+}
+
+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
+{
+ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
+{
+ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
+}
+
+#define eytzinger1_for_each(_i, _size) \
+ for ((_i) = eytzinger1_first((_size)); \
+ (_i) != 0; \
+ (_i) = eytzinger1_next((_i), (_size)))
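
A usage sketch for the iterator above (illustrative; assumes an eytzinger-ordered array eytz[] with valid entries at indices 1..nr): the walk starts at the leftmost node and visits entries in sorted (in-order) order.

	unsigned i;

	eytzinger1_for_each(i, nr)
		pr_info("eytz[%u] = %d\n", i, eytz[i]);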
+
+/* Zero-based indexing version: */
+
+static inline unsigned eytzinger0_child(unsigned i, unsigned child)
+{
+ EBUG_ON(child > 1);
+
+ return (i << 1) + 1 + child;
+}
+
+static inline unsigned eytzinger0_left_child(unsigned i)
+{
+ return eytzinger0_child(i, 0);
+}
+
+static inline unsigned eytzinger0_right_child(unsigned i)
+{
+ return eytzinger0_child(i, 1);
+}
+
+static inline unsigned eytzinger0_first(unsigned size)
+{
+ return eytzinger1_first(size) - 1;
+}
+
+static inline unsigned eytzinger0_last(unsigned size)
+{
+ return eytzinger1_last(size) - 1;
+}
+
+static inline unsigned eytzinger0_next(unsigned i, unsigned size)
+{
+ return eytzinger1_next(i + 1, size) - 1;
+}
+
+static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
+{
+ return eytzinger1_prev(i + 1, size) - 1;
+}
+
+static inline unsigned eytzinger0_extra(unsigned size)
+{
+ return eytzinger1_extra(size);
+}
+
+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
+ unsigned extra)
+{
+ return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
+}
+
+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
+ unsigned extra)
+{
+ return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
+}
+
+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
+{
+ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
+{
+ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
+}
+
+#define eytzinger0_for_each(_i, _size) \
+ for ((_i) = eytzinger0_first((_size)); \
+ (_i) != -1; \
+ (_i) = eytzinger0_next((_i), (_size)))
+
+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
+
+/* return greatest node <= @search, or -1 if not found */
+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
+ eytzinger_cmp_fn cmp, const void *search)
+{
+ unsigned i, n = 0;
+
+ if (!nr)
+ return -1;
+
+ do {
+ i = n;
+ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ } while (n < nr);
+
+ if (n & 1) {
+ /* @i was greater than @search, return previous node: */
+
+ if (i == eytzinger0_first(nr))
+ return -1;
+
+ return eytzinger0_prev(i, nr);
+ } else {
+ return i;
+ }
+}
+
+#define eytzinger0_find(base, nr, size, _cmp, search) \
+({ \
+ void *_base = (base); \
+ void *_search = (search); \
+ size_t _nr = (nr); \
+ size_t _size = (size); \
+ size_t _i = 0; \
+ int _res; \
+ \
+ while (_i < _nr && \
+ (_res = _cmp(_search, _base + _i * _size, _size))) \
+ _i = eytzinger0_child(_i, _res > 0); \
+ _i; \
+})
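
A standalone sketch (userspace, illustrative data, not part of the patch) of the descent eytzinger0_find() performs: at node i the comparison result picks child 2*i + 1 or 2*i + 2 - exactly eytzinger0_child(i, res > 0) - until the index runs past the array.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	/* eytzinger0 layout of the sorted array { 10, 20, 30, 40, 50, 60, 70 } */
	int eytz[] = { 40, 20, 60, 10, 30, 50, 70 };
	size_t nr = 7, i = 0;
	int search = 50;

	while (i < nr && eytz[i] != search)
		i = 2 * i + 1 + (search > eytz[i]);

	if (i < nr)
		printf("found %d at eytzinger index %zu\n", search, i);
	return 0;
}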
+
+void eytzinger0_sort(void *, size_t, size_t,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t));
+
+#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
new file mode 100644
index 000000000000..66b945be10c2
--- /dev/null
+++ b/fs/bcachefs/fifo.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FIFO_H
+#define _BCACHEFS_FIFO_H
+
+#include "util.h"
+
+#define FIFO(type) \
+struct { \
+ size_t front, back, size, mask; \
+ type *data; \
+}
+
+#define DECLARE_FIFO(type, name) FIFO(type) name
+
+#define fifo_buf_size(fifo) \
+ ((fifo)->size \
+ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \
+ : 0)
+
+#define init_fifo(fifo, _size, _gfp) \
+({ \
+ (fifo)->front = (fifo)->back = 0; \
+ (fifo)->size = (_size); \
+ (fifo)->mask = (fifo)->size \
+ ? roundup_pow_of_two((fifo)->size) - 1 \
+ : 0; \
+ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
+})
+
+#define free_fifo(fifo) \
+do { \
+ kvpfree((fifo)->data, fifo_buf_size(fifo)); \
+ (fifo)->data = NULL; \
+} while (0)
+
+#define fifo_swap(l, r) \
+do { \
+ swap((l)->front, (r)->front); \
+ swap((l)->back, (r)->back); \
+ swap((l)->size, (r)->size); \
+ swap((l)->mask, (r)->mask); \
+ swap((l)->data, (r)->data); \
+} while (0)
+
+#define fifo_move(dest, src) \
+do { \
+ typeof(*((dest)->data)) _t; \
+ while (!fifo_full(dest) && \
+ fifo_pop(src, _t)) \
+ fifo_push(dest, _t); \
+} while (0)
+
+#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
+
+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_entry_idx_abs(fifo, p) \
+ ((((p) >= &fifo_peek_front(fifo) \
+ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \
+ (((p) - (fifo)->data)))
+
+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
+#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
+
+#define fifo_push_back_ref(f) \
+ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
+
+#define fifo_push_front_ref(f) \
+ (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
+
+#define fifo_push_back(fifo, new) \
+({ \
+ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \
+ if (_r) \
+ *_r = (new); \
+ _r != NULL; \
+})
+
+#define fifo_push_front(fifo, new) \
+({ \
+ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \
+ if (_r) \
+ *_r = (new); \
+ _r != NULL; \
+})
+
+#define fifo_pop_front(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) \
+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
+ _r; \
+})
+
+#define fifo_pop_back(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) \
+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
+ _r; \
+})
+
+#define fifo_push_ref(fifo) fifo_push_back_ref(fifo)
+#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
+#define fifo_peek(fifo) fifo_peek_front(fifo)
+
+#define fifo_for_each_entry(_entry, _fifo, _iter) \
+ for (typecheck(typeof((_fifo)->front), _iter), \
+ (_iter) = (_fifo)->front; \
+ ((_iter != (_fifo)->back) && \
+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+ (_iter)++)
+
+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
+ for (typecheck(typeof((_fifo)->front), _iter), \
+ (_iter) = (_fifo)->front; \
+ ((_iter != (_fifo)->back) && \
+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+ (_iter)++)
+
+#endif /* _BCACHEFS_FIFO_H */
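
A usage sketch for these macros (kernel context assumed; fifo_demo() is a hypothetical function, not part of the patch): declare a FIFO of u64s, size it, then push and pop through it. init_fifo() rounds the backing buffer up to a power of two so the push/pop macros can index it with a simple mask.

static int fifo_demo(void)
{
	FIFO(u64) q;
	u64 v;

	if (!init_fifo(&q, 8, GFP_KERNEL))
		return -ENOMEM;

	fifo_push(&q, 1);
	fifo_push(&q, 2);

	while (fifo_pop(&q, v))		/* pops 1, then 2 (FIFO order) */
		pr_info("popped %llu\n", v);

	free_fifo(&q);
	return 0;
}
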
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
new file mode 100644
index 000000000000..4496cf91a4c1
--- /dev/null
+++ b/fs/bcachefs/fs-common.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "fs-common.h"
+#include "inode.h"
+#include "subvolume.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *new_inode,
+ const struct qstr *name,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl,
+ subvol_inum snapshot_src,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ subvol_inum new_inum = dir;
+ u64 now = bch2_current_time(c);
+ u64 cpu = raw_smp_processor_id();
+ u64 dir_target;
+ u32 snapshot;
+ unsigned dir_type = mode_to_type(mode);
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ /* Normal create path - allocate a new inode: */
+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+ if (flags & BCH_CREATE_TMPFILE)
+ new_inode->bi_flags |= BCH_INODE_unlinked;
+
+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
+ if (ret)
+ goto err;
+
+ snapshot_src = (subvol_inum) { 0 };
+ } else {
+ /*
+ * Creating a snapshot - we're not allocating a new inode, but
+ * we do have to look up the root inode of the subvolume we're
+ * snapshotting and update it (in the new snapshot):
+ */
+
+ if (!snapshot_src.inum) {
+ /* Inode wasn't specified, just snapshot: */
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
+ BTREE_ITER_CACHED, &s);
+ if (ret)
+ goto err;
+
+ snapshot_src.inum = le64_to_cpu(s.inode);
+ }
+
+ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (new_inode->bi_subvol != snapshot_src.subvol) {
+ /* Not a subvolume root: */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If we're not root, we have to own the subvolume being
+ * snapshotted:
+ */
+ if (uid && new_inode->bi_uid != uid) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ flags |= BCH_CREATE_SUBVOL;
+ }
+
+ new_inum.inum = new_inode->bi_inum;
+ dir_target = new_inode->bi_inum;
+
+ if (flags & BCH_CREATE_SUBVOL) {
+ u32 new_subvol, dir_snapshot;
+
+ ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ snapshot_src.subvol,
+ &new_subvol, &snapshot,
+ (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
+ if (ret)
+ goto err;
+
+ new_inode->bi_parent_subvol = dir.subvol;
+ new_inode->bi_subvol = new_subvol;
+ new_inum.subvol = new_subvol;
+ dir_target = new_subvol;
+ dir_type = DT_SUBVOL;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(&dir_iter);
+ if (ret)
+ goto err;
+ }
+
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ if (default_acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ default_acl, ACL_TYPE_DEFAULT);
+ if (ret)
+ goto err;
+ }
+
+ if (acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ acl, ACL_TYPE_ACCESS);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (!(flags & BCH_CREATE_TMPFILE)) {
+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
+ u64 dir_offset;
+
+ if (is_subdir_for_nlink(new_inode))
+ dir_u->bi_nlink++;
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+ ret = bch2_inode_write(trans, &dir_iter, dir_u);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
+ &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ goto err;
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
+ }
+
+ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
+
+ ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, new_inode);
+err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
+ return ret;
+}
+
+int bch2_link_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_inode_unpacked *dir_u,
+ subvol_inum inum, struct bch_inode_unpacked *inode_u,
+ const struct qstr *name)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ struct bch_hash_info dir_hash;
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
+ int ret;
+
+ if (dir.subvol != inum.subvol)
+ return -EXDEV;
+
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ inode_u->bi_ctime = now;
+ ret = bch2_inode_nlink_inc(inode_u);
+ if (ret)
+ return ret;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (bch2_reinherit_attrs(inode_u, dir_u)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+ dir_hash = bch2_hash_info_init(c, dir_u);
+
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ mode_to_type(inode_u->bi_mode),
+ name, inum.inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ goto err;
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ inode_u->bi_dir = dir.inum;
+ inode_u->bi_dir_offset = dir_offset;
+ }
+
+ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
+err:
+ bch2_trans_iter_exit(trans, &dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ return ret;
+}
+
+int bch2_unlink_trans(struct btree_trans *trans,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *inode_u,
+ const struct qstr *name,
+ bool deleting_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter dirent_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ struct bch_hash_info dir_hash;
+ subvol_inum inum;
+ u64 now = bch2_current_time(c);
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ dir_hash = bch2_hash_info_init(c, dir_u);
+
+ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, inum);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_snapshot && !inode_u->bi_subvol) {
+ ret = -BCH_ERR_ENOENT_not_subvol;
+ goto err;
+ }
+
+ if (deleting_snapshot || inode_u->bi_subvol) {
+ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+
+ k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the
+ * dirent, not just emit a whiteout in the current snapshot:
+ */
+ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dirent_iter);
+ if (ret)
+ goto err;
+ } else {
+ bch2_inode_nlink_dec(trans, inode_u);
+ }
+
+ if (inode_u->bi_dir == dirent_iter.pos.inode &&
+ inode_u->bi_dir_offset == dirent_iter.pos.offset) {
+ inode_u->bi_dir = 0;
+ inode_u->bi_dir_offset = 0;
+ }
+
+ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
+ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
+
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash, &dirent_iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
+err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
+ return ret;
+}
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
+ struct bch_inode_unpacked *src_u)
+{
+ u64 src, dst;
+ unsigned id;
+ bool ret = false;
+
+ for (id = 0; id < Inode_opt_nr; id++) {
+ /* Skip attributes that were explicitly set on this inode */
+ if (dst_u->bi_fields_set & (1 << id))
+ continue;
+
+ src = bch2_inode_opt_get(src_u, id);
+ dst = bch2_inode_opt_get(dst_u, id);
+
+ if (src == dst)
+ continue;
+
+ bch2_inode_opt_set(dst_u, id, src);
+ ret = true;
+ }
+
+ return ret;
+}
+
+int bch2_rename_trans(struct btree_trans *trans,
+ subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
+ struct bch_inode_unpacked *src_inode_u,
+ struct bch_inode_unpacked *dst_inode_u,
+ const struct qstr *src_name,
+ const struct qstr *dst_name,
+ enum bch_rename_mode mode)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter src_dir_iter = { NULL };
+ struct btree_iter dst_dir_iter = { NULL };
+ struct btree_iter src_inode_iter = { NULL };
+ struct btree_iter dst_inode_iter = { NULL };
+ struct bch_hash_info src_hash, dst_hash;
+ subvol_inum src_inum, dst_inum;
+ u64 src_offset, dst_offset;
+ u64 now = bch2_current_time(c);
+ int ret;
+
+ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ src_hash = bch2_hash_info_init(c, src_dir_u);
+
+ if (dst_dir.inum != src_dir.inum ||
+ dst_dir.subvol != src_dir.subvol) {
+ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ dst_hash = bch2_hash_info_init(c, dst_dir_u);
+ } else {
+ dst_dir_u = src_dir_u;
+ dst_hash = src_hash;
+ }
+
+ ret = bch2_dirent_rename(trans,
+ src_dir, &src_hash,
+ dst_dir, &dst_hash,
+ src_name, &src_inum, &src_offset,
+ dst_name, &dst_inum, &dst_offset,
+ mode);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (dst_inum.inum) {
+ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+ }
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
+
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE &&
+ dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
+ dst_inode_u->bi_dir_offset == src_offset) {
+ dst_inode_u->bi_dir = 0;
+ dst_inode_u->bi_dir_offset = 0;
+ }
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE) {
+ if (S_ISDIR(src_inode_u->bi_mode) !=
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -ENOTDIR;
+ goto err;
+ }
+
+ if (S_ISDIR(dst_inode_u->bi_mode) &&
+ bch2_empty_dir_trans(trans, dst_inum)) {
+ ret = -ENOTEMPTY;
+ goto err;
+ }
+ }
+
+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
+ S_ISDIR(src_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (is_subdir_for_nlink(src_inode_u)) {
+ src_dir_u->bi_nlink--;
+ dst_dir_u->bi_nlink++;
+ }
+
+ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
+ dst_dir_u->bi_nlink--;
+ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE)
+ bch2_inode_nlink_dec(trans, dst_inode_u);
+
+ src_dir_u->bi_mtime = now;
+ src_dir_u->bi_ctime = now;
+
+ if (src_dir.inum != dst_dir.inum) {
+ dst_dir_u->bi_mtime = now;
+ dst_dir_u->bi_ctime = now;
+ }
+
+ src_inode_u->bi_ctime = now;
+
+ if (dst_inum.inum)
+ dst_inode_u->bi_ctime = now;
+
+ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
+ (src_dir.inum != dst_dir.inum
+ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
+ : 0) ?:
+ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
+ (dst_inum.inum
+ ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
+ : 0);
+err:
+ bch2_trans_iter_exit(trans, &dst_inode_iter);
+ bch2_trans_iter_exit(trans, &src_inode_iter);
+ bch2_trans_iter_exit(trans, &dst_dir_iter);
+ bch2_trans_iter_exit(trans, &src_dir_iter);
+ return ret;
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
new file mode 100644
index 000000000000..dde237859514
--- /dev/null
+++ b/fs/bcachefs/fs-common.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_COMMON_H
+#define _BCACHEFS_FS_COMMON_H
+
+struct posix_acl;
+
+#define BCH_CREATE_TMPFILE (1U << 0)
+#define BCH_CREATE_SUBVOL (1U << 1)
+#define BCH_CREATE_SNAPSHOT (1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *,
+ uid_t, gid_t, umode_t, dev_t,
+ struct posix_acl *,
+ struct posix_acl *,
+ subvol_inum, unsigned);
+
+int bch2_link_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ const struct qstr *);
+
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *, bool);
+
+int bch2_rename_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *,
+ const struct qstr *,
+ enum bch_rename_mode);
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *);
+
+#endif /* _BCACHEFS_FS_COMMON_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
new file mode 100644
index 000000000000..52f0e7acda3d
--- /dev/null
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -0,0 +1,1106 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/backing-dev.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return true;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return true;
+ return false;
+}
+
+/* readpage(s): */
+
+static void bch2_readpages_end_io(struct bio *bio)
+{
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio) {
+ if (!bio->bi_status) {
+ folio_mark_uptodate(fi.folio);
+ } else {
+ folio_clear_uptodate(fi.folio);
+ folio_set_error(fi.folio);
+ }
+ folio_unlock(fi.folio);
+ }
+
+ bio_put(bio);
+}
+
+struct readpages_iter {
+ struct address_space *mapping;
+ unsigned idx;
+ folios folios;
+};
+
+static int readpages_iter_init(struct readpages_iter *iter,
+ struct readahead_control *ractl)
+{
+ struct folio **fi;
+ int ret;
+
+ memset(iter, 0, sizeof(*iter));
+
+ iter->mapping = ractl->mapping;
+
+ ret = bch2_filemap_get_contig_folios_d(iter->mapping,
+ ractl->_index << PAGE_SHIFT,
+ (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
+ 0, mapping_gfp_mask(iter->mapping),
+ &iter->folios);
+ if (ret)
+ return ret;
+
+ darray_for_each(iter->folios, fi) {
+ ractl->_nr_pages -= 1U << folio_order(*fi);
+ __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
+ folio_put(*fi);
+ folio_put(*fi);
+ }
+
+ return 0;
+}
+
+static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
+{
+ if (iter->idx >= iter->folios.nr)
+ return NULL;
+ return iter->folios.data[iter->idx];
+}
+
+static inline void readpage_iter_advance(struct readpages_iter *iter)
+{
+ iter->idx++;
+}
+
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc.csum_type || crc.compression_type)
+ return true;
+ return false;
+}
+
+static int readpage_bio_extend(struct btree_trans *trans,
+ struct readpages_iter *iter,
+ struct bio *bio,
+ unsigned sectors_this_extent,
+ bool get_more)
+{
+ /* Don't hold btree locks while allocating memory: */
+ bch2_trans_unlock(trans);
+
+ while (bio_sectors(bio) < sectors_this_extent &&
+ bio->bi_vcnt < bio->bi_max_vecs) {
+ struct folio *folio = readpage_iter_peek(iter);
+ int ret;
+
+ if (folio) {
+ readpage_iter_advance(iter);
+ } else {
+ pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
+
+ if (!get_more)
+ break;
+
+ folio = xa_load(&iter->mapping->i_pages, folio_offset);
+ if (folio && !xa_is_value(folio))
+ break;
+
+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+ if (!folio)
+ break;
+
+ if (!__bch2_folio_create(folio, GFP_KERNEL)) {
+ folio_put(folio);
+ break;
+ }
+
+ ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
+ if (ret) {
+ __bch2_folio_release(folio);
+ folio_put(folio);
+ break;
+ }
+
+ folio_put(folio);
+ }
+
+ BUG_ON(folio_sector(folio) != bio_end_sector(bio));
+
+ BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
+ }
+
+ return bch2_trans_relock(trans);
+}
+
+static void bchfs_read(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ subvol_inum inum,
+ struct readpages_iter *readpages_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ int flags = BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE;
+ u32 snapshot;
+ int ret = 0;
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
+
+ bch2_bkey_buf_init(&sk);
+retry:
+ bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS);
+ while (1) {
+ struct bkey_s_c k;
+ unsigned bytes, sectors, offset_into_extent;
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ /*
+ * read_extent -> io_time_reset may cause a transaction restart
+ * without returning an error; we need to check for that here:
+ */
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ offset_into_extent = iter.pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &sk);
+ if (ret)
+ break;
+
+ k = bkey_i_to_s_c(sk.k);
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ if (readpages_iter) {
+ ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
+ extent_partial_reads_expensive(k));
+ if (ret)
+ break;
+ }
+
+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+
+ if (rbio->bio.bi_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
+
+ bch2_bio_page_state_set(&rbio->bio, k);
+
+ bch2_read_extent(trans, rbio, iter.pos,
+ data_btree, k, offset_into_extent, flags);
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ break;
+
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+ bio_advance(&rbio->bio, bytes);
+
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (ret) {
+ bch_err_inum_offset_ratelimited(c,
+ iter.pos.inode,
+ iter.pos.offset << 9,
+ "read error %i from btree lookup", ret);
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ bio_endio(&rbio->bio);
+ }
+
+ bch2_bkey_buf_exit(&sk, c);
+}
+
+void bch2_readahead(struct readahead_control *ractl)
+{
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct folio *folio;
+ struct readpages_iter readpages_iter;
+ int ret;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ ret = readpages_iter_init(&readpages_iter, ractl);
+ BUG_ON(ret);
+
+ bch2_pagecache_add_get(inode);
+
+ while ((folio = readpage_iter_peek(&readpages_iter))) {
+ unsigned n = min_t(unsigned,
+ readpages_iter.folios.nr -
+ readpages_iter.idx,
+ BIO_MAX_VECS);
+ struct bch_read_bio *rbio =
+ rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+ GFP_KERNEL, &c->bio_read),
+ opts);
+
+ readpage_iter_advance(&readpages_iter);
+
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ rbio->bio.bi_end_io = bch2_readpages_end_io;
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ bchfs_read(trans, rbio, inode_inum(inode),
+ &readpages_iter);
+ bch2_trans_unlock(trans);
+ }
+
+ bch2_pagecache_add_put(inode);
+
+ bch2_trans_put(trans);
+ darray_exit(&readpages_iter.folios);
+}
+
+static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
+ subvol_inum inum, struct folio *folio)
+{
+ bch2_folio_create(folio, __GFP_NOFAIL);
+
+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
+}
+
+static void bch2_read_single_folio_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+}
+
+int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_read_bio *rbio;
+ struct bch_io_opts opts;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
+ opts);
+ rbio->bio.bi_private = &done;
+ rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
+
+ __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+ wait_for_completion(&done);
+
+ ret = blk_status_to_errno(rbio->bio.bi_status);
+ bio_put(&rbio->bio);
+
+ if (ret < 0)
+ return ret;
+
+ folio_mark_uptodate(folio);
+ return 0;
+}
+
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+ int ret;
+
+ ret = bch2_read_single_folio(folio, folio->mapping);
+ folio_unlock(folio);
+ return bch2_err_class(ret);
+}
+
+/* writepages: */
+
+struct bch_writepage_io {
+ struct bch_inode_info *inode;
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
+struct bch_writepage_state {
+ struct bch_writepage_io *io;
+ struct bch_io_opts opts;
+ struct bch_folio_sector *tmp;
+ unsigned tmp_sectors;
+};
+
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct bch_writepage_state ret = { 0 };
+
+ bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
+ return ret;
+}
+
+/*
+ * Determine when a writepage io is full. We have to limit writepage bios to a
+ * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
+ * what the bounce path in bch2_write_extent() can handle. In theory we could
+ * loosen this restriction for non-bounce I/O, but we don't have that context
+ * here. Ideally, we can up this limit and make it configurable in the future
+ * when the bounce path can be enhanced to accommodate larger source bios.
+ */
+static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
+{
+ struct bio *bio = &io->op.wbio.bio;
+ return bio_full(bio, len) ||
+ (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
+}
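
Back-of-the-envelope for the limit described above (standalone sketch; the usual BIO_MAX_VECS of 256 and 4k pages are assumed, illustrative constants only):

#include <stdio.h>

int main(void)
{
	unsigned max_vecs = 256, page_size = 4096;

	printf("max writepage bio: %u KiB\n", max_vecs * page_size / 1024);
	return 0;
}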
+
+static void bch2_writepage_io_done(struct bch_write_op *op)
+{
+ struct bch_writepage_io *io =
+ container_of(op, struct bch_writepage_io, op);
+ struct bch_fs *c = io->op.c;
+ struct bio *bio = &io->op.wbio.bio;
+ struct folio_iter fi;
+ unsigned i;
+
+ if (io->op.error) {
+ set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s;
+
+ folio_set_error(fi.folio);
+ mapping_set_error(fi.folio->mapping, -EIO);
+
+ s = __bch2_folio(fi.folio);
+ spin_lock(&s->lock);
+ for (i = 0; i < folio_sectors(fi.folio); i++)
+ s->s[i].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+ }
+
+ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s;
+
+ s = __bch2_folio(fi.folio);
+ spin_lock(&s->lock);
+ for (i = 0; i < folio_sectors(fi.folio); i++)
+ s->s[i].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+ }
+
+ /*
+ * racing with fallocate can cause us to add fewer sectors than
+ * expected - but we shouldn't add more sectors than expected:
+ */
+ WARN_ON_ONCE(io->op.i_sectors_delta > 0);
+
+ /*
+ * (error (due to going RO) halfway through a page can screw that up
+ * slightly)
+ * XXX wtf?
+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
+ */
+
+ /*
+ * PageWriteback is effectively our ref on the inode - fix up i_blocks
+ * before calling end_page_writeback:
+ */
+ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
+
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s = __bch2_folio(fi.folio);
+
+ if (atomic_dec_and_test(&s->write_count))
+ folio_end_writeback(fi.folio);
+ }
+
+ bio_put(&io->op.wbio.bio);
+}
+
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+ struct bch_writepage_io *io = w->io;
+
+ w->io = NULL;
+ closure_call(&io->op.cl, bch2_write, NULL, NULL);
+}
+
+/*
+ * Get a bch_writepage_io and add @page to it - appending to an existing one if
+ * possible, else allocating a new one:
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+ struct writeback_control *wbc,
+ struct bch_writepage_state *w,
+ struct bch_inode_info *inode,
+ u64 sector,
+ unsigned nr_replicas)
+{
+ struct bch_write_op *op;
+
+ w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+ REQ_OP_WRITE,
+ GFP_KERNEL,
+ &c->writepage_bioset),
+ struct bch_writepage_io, op.wbio.bio);
+
+ w->io->inode = inode;
+ op = &w->io->op;
+ bch2_write_op_init(op, c, w->opts);
+ op->target = w->opts.foreground_target;
+ op->nr_replicas = nr_replicas;
+ op->res.nr_replicas = nr_replicas;
+ op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->subvol = inode->ei_subvol;
+ op->pos = POS(inode->v.i_ino, sector);
+ op->end_io = bch2_writepage_io_done;
+ op->devs_need_flush = &inode->ei_devs_need_flush;
+ op->wbio.bio.bi_iter.bi_sector = sector;
+ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
+}
+
+static int __bch2_writepage(struct folio *folio,
+ struct writeback_control *wbc,
+ void *data)
+{
+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_writepage_state *w = data;
+ struct bch_folio *s;
+ unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
+ loff_t i_size = i_size_read(&inode->v);
+ int ret;
+
+ EBUG_ON(!folio_test_uptodate(folio));
+
+ /* Is the folio fully inside i_size? */
+ if (folio_end_pos(folio) <= i_size)
+ goto do_io;
+
+ /* Is the folio fully outside i_size? (truncate in progress) */
+ if (folio_pos(folio) >= i_size) {
+ folio_unlock(folio);
+ return 0;
+ }
+
+ /*
+ * The folio straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the folio size. For a file that is not a multiple of
+ * the folio size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ folio_zero_segment(folio,
+ i_size - folio_pos(folio),
+ folio_size(folio));
+do_io:
+ f_sectors = folio_sectors(folio);
+ s = bch2_folio(folio);
+
+ if (f_sectors > w->tmp_sectors) {
+ kfree(w->tmp);
+ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
+ w->tmp_sectors = f_sectors;
+ }
+
+ /*
+ * Things get really hairy with errors during writeback:
+ */
+ ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
+ BUG_ON(ret);
+
+ /* Before unlocking the page, get copy of reservations: */
+ spin_lock(&s->lock);
+ memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
+
+ for (i = 0; i < f_sectors; i++) {
+ if (s->s[i].state < SECTOR_dirty)
+ continue;
+
+ nr_replicas_this_write =
+ min_t(unsigned, nr_replicas_this_write,
+ s->s[i].nr_replicas +
+ s->s[i].replicas_reserved);
+ }
+
+ for (i = 0; i < f_sectors; i++) {
+ if (s->s[i].state < SECTOR_dirty)
+ continue;
+
+ s->s[i].nr_replicas = w->opts.compression
+ ? 0 : nr_replicas_this_write;
+
+ s->s[i].replicas_reserved = 0;
+ bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
+ }
+ spin_unlock(&s->lock);
+
+ BUG_ON(atomic_read(&s->write_count));
+ atomic_set(&s->write_count, 1);
+
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+
+ folio_unlock(folio);
+
+ offset = 0;
+ while (1) {
+ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
+ u64 sector;
+
+ while (offset < f_sectors &&
+ w->tmp[offset].state < SECTOR_dirty)
+ offset++;
+
+ if (offset == f_sectors)
+ break;
+
+ while (offset + sectors < f_sectors &&
+ w->tmp[offset + sectors].state >= SECTOR_dirty) {
+ reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
+ dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
+ sectors++;
+ }
+ BUG_ON(!sectors);
+
+ sector = folio_sector(folio) + offset;
+
+ if (w->io &&
+ (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+ bch_io_full(w->io, sectors << 9) ||
+ bio_end_sector(&w->io->op.wbio.bio) != sector))
+ bch2_writepage_do_io(w);
+
+ if (!w->io)
+ bch2_writepage_io_alloc(c, wbc, w, inode, sector,
+ nr_replicas_this_write);
+
+ atomic_inc(&s->write_count);
+
+ BUG_ON(inode != w->io->inode);
+ BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
+ sectors << 9, offset << 9));
+
+ /* Check for writing past i_size: */
+ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+ round_up(i_size, block_bytes(c)) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+ "writing past i_size: %llu > %llu (unrounded %llu)\n",
+ bio_end_sector(&w->io->op.wbio.bio) << 9,
+ round_up(i_size, block_bytes(c)),
+ i_size);
+
+ w->io->op.res.sectors += reserved_sectors;
+ w->io->op.i_sectors_delta -= dirty_sectors;
+ w->io->op.new_i_size = i_size;
+
+ offset += sectors;
+ }
+
+ if (atomic_dec_and_test(&s->write_count))
+ folio_end_writeback(folio);
+
+ return 0;
+}
+
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(mapping->host));
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
+ if (w.io)
+ bch2_writepage_do_io(&w);
+ blk_finish_plug(&plug);
+ kfree(w.tmp);
+ return bch2_err_class(ret);
+}
+
+/* buffered writes: */
+
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct page **pagep, void **fsdata)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation *res;
+ struct folio *folio;
+ unsigned offset;
+ int ret = -ENOMEM;
+
+ res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ bch2_folio_reservation_init(c, inode, res);
+ *fsdata = res;
+
+ bch2_pagecache_add_get(inode);
+
+ folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
+ FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR_OR_NULL(folio))
+ goto err_unlock;
+
+ offset = pos - folio_pos(folio);
+ len = min_t(size_t, len, folio_end_pos(folio) - pos);
+
+ if (folio_test_uptodate(folio))
+ goto out;
+
+ /* If we're writing entire folio, don't need to read it in first: */
+ if (!offset && len == folio_size(folio))
+ goto out;
+
+ if (!offset && pos + len >= inode->v.i_size) {
+ folio_zero_segment(folio, len, folio_size(folio));
+ flush_dcache_folio(folio);
+ goto out;
+ }
+
+ if (folio_pos(folio) >= inode->v.i_size) {
+ folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
+ flush_dcache_folio(folio);
+ goto out;
+ }
+readpage:
+ ret = bch2_read_single_folio(folio, mapping);
+ if (ret)
+ goto err;
+out:
+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+ if (ret)
+ goto err;
+
+ ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
+ if (ret) {
+ if (!folio_test_uptodate(folio)) {
+ /*
+ * If the folio hasn't been read in, we won't know if we
+ * actually need a reservation - we don't actually need
+ * to read here, we just need to check if the folio is
+ * fully backed by uncompressed data:
+ */
+ goto readpage;
+ }
+
+ goto err;
+ }
+
+ *pagep = &folio->page;
+ return 0;
+err:
+ folio_unlock(folio);
+ folio_put(folio);
+ *pagep = NULL;
+err_unlock:
+ bch2_pagecache_add_put(inode);
+ kfree(res);
+ *fsdata = NULL;
+ return bch2_err_class(ret);
+}
+
+int bch2_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation *res = fsdata;
+ struct folio *folio = page_folio(page);
+ unsigned offset = pos - folio_pos(folio);
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+ BUG_ON(offset + copied > folio_size(folio));
+
+ if (unlikely(copied < len && !folio_test_uptodate(folio))) {
+ /*
+ * The folio needs to be read in, but that would destroy
+ * our partial write - simplest thing is to just force
+ * userspace to redo the write:
+ */
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ copied = 0;
+ }
+
+ spin_lock(&inode->v.i_lock);
+ if (pos + copied > inode->v.i_size)
+ i_size_write(&inode->v, pos + copied);
+ spin_unlock(&inode->v.i_lock);
+
+ if (copied) {
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+
+ bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
+
+ inode->ei_last_dirtied = (unsigned long) current;
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+ bch2_pagecache_add_put(inode);
+
+ bch2_folio_reservation_put(c, inode, res);
+ kfree(res);
+
+ return copied;
+}
+
+static noinline void folios_trunc(folios *fs, struct folio **fi)
+{
+ while (fs->data + fs->nr > fi) {
+ struct folio *f = darray_pop(fs);
+
+ folio_unlock(f);
+ folio_put(f);
+ }
+}
+
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+ struct address_space *mapping,
+ struct iov_iter *iter,
+ loff_t pos, unsigned len)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation res;
+ folios fs;
+ struct folio **fi, *f;
+ unsigned copied = 0, f_offset, f_copied;
+ u64 end = pos + len, f_pos, f_len;
+ loff_t last_folio_pos = inode->v.i_size;
+ int ret = 0;
+
+ BUG_ON(!len);
+
+ bch2_folio_reservation_init(c, inode, &res);
+ darray_init(&fs);
+
+ ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
+ FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
+ mapping_gfp_mask(mapping),
+ &fs);
+ if (ret)
+ goto out;
+
+ BUG_ON(!fs.nr);
+
+ f = darray_first(fs);
+ if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
+ f = darray_last(fs);
+ end = min(end, folio_end_pos(f));
+ last_folio_pos = folio_pos(f);
+ if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
+ if (end >= inode->v.i_size) {
+ folio_zero_range(f, 0, folio_size(f));
+ } else {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+ }
+
+ ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
+ if (ret)
+ goto out;
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+
+ /*
+ * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
+ * supposed to write as much as we have disk space for.
+ *
+ * On failure here we should still write out a partial page if
+ * we aren't completely out of disk space - we don't do that
+ * yet:
+ */
+ ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
+ if (unlikely(ret)) {
+ folios_trunc(&fs, fi);
+ if (!fs.nr)
+ goto out;
+
+ end = min(end, folio_end_pos(darray_last(fs)));
+ break;
+ }
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ darray_for_each(fs, fi)
+ flush_dcache_folio(*fi);
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+ f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+ if (!f_copied) {
+ folios_trunc(&fs, fi);
+ break;
+ }
+
+ if (!folio_test_uptodate(f) &&
+ f_copied != folio_size(f) &&
+ pos + copied + f_copied < inode->v.i_size) {
+ iov_iter_revert(iter, f_copied);
+ folio_zero_range(f, 0, folio_size(f));
+ folios_trunc(&fs, fi);
+ break;
+ }
+
+ flush_dcache_folio(f);
+ copied += f_copied;
+
+ if (f_copied != f_len) {
+ folios_trunc(&fs, fi + 1);
+ break;
+ }
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ if (!copied)
+ goto out;
+
+ end = pos + copied;
+
+ spin_lock(&inode->v.i_lock);
+ if (end > inode->v.i_size)
+ i_size_write(&inode->v, end);
+ spin_unlock(&inode->v.i_lock);
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+
+ if (!folio_test_uptodate(f))
+ folio_mark_uptodate(f);
+
+ bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ inode->ei_last_dirtied = (unsigned long) current;
+out:
+ darray_for_each(fs, fi) {
+ folio_unlock(*fi);
+ folio_put(*fi);
+ }
+
+ /*
+ * If the last folio added to the mapping starts beyond current EOF, we
+ * performed a short write but left around at least one post-EOF folio.
+ * Clean up the mapping before we return.
+ */
+ if (last_folio_pos >= inode->v.i_size)
+ truncate_pagecache(&inode->v, inode->v.i_size);
+
+ darray_exit(&fs);
+ bch2_folio_reservation_put(c, inode, &res);
+
+ return copied ?: ret;
+}
+
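+/*
+ * Outer buffered write loop: fault in the source pages before each copy (to
+ * avoid deadlocking against a fault on the folios we're writing to), call
+ * __bch2_buffered_write() for each chunk, and fall back to single segment
+ * copies when no progress could be made. Dirty page balancing happens per
+ * iteration.
+ */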
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ int ret = 0;
+
+ bch2_pagecache_add_get(inode);
+
+ do {
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ unsigned bytes = iov_iter_count(iter);
+again:
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+ bytes = min_t(unsigned long, iov_iter_count(iter),
+ PAGE_SIZE - offset);
+
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ if (unlikely(fatal_signal_pending(current))) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ if (unlikely(ret < 0))
+ break;
+
+ cond_resched();
+
+ if (unlikely(ret == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fall back here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_single_seg_count(iter));
+ goto again;
+ }
+ pos += ret;
+ written += ret;
+ ret = 0;
+
+ balance_dirty_pages_ratelimited(mapping);
+ } while (iov_iter_count(iter));
+
+ bch2_pagecache_add_put(inode);
+
+ return written ? written : ret;
+}
+
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = bch2_direct_write(iocb, from);
+ goto out;
+ }
+
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+
+ ret = file_remove_privs(file);
+ if (ret)
+ goto unlock;
+
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+
+ ret = bch2_buffered_write(iocb, from);
+ if (likely(ret > 0))
+ iocb->ki_pos += ret;
+unlock:
+ inode_unlock(&inode->v);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+out:
+ return bch2_err_class(ret);
+}
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->writepage_bioset);
+}
+
+int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->writepage_bioset,
+ 4, offsetof(struct bch_writepage_io, op.wbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_writepage_bioset_init;
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
new file mode 100644
index 000000000000..a6126ff790e6
--- /dev/null
+++ b/fs/bcachefs/fs-io-buffered.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_BUFFERED_H
+#define _BCACHEFS_FS_IO_BUFFERED_H
+
+#ifndef NO_BCACHEFS_FS
+
+int bch2_read_single_folio(struct folio *, struct address_space *);
+int bch2_read_folio(struct file *, struct folio *);
+
+int bch2_writepages(struct address_space *, struct writeback_control *);
+void bch2_readahead(struct readahead_control *);
+
+int bch2_write_begin(struct file *, struct address_space *, loff_t,
+ unsigned, struct page **, void **);
+int bch2_write_end(struct file *, struct address_space *, loff_t,
+ unsigned, unsigned, struct page *, void *);
+
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
+int bch2_fs_fs_io_buffered_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
new file mode 100644
index 000000000000..84e20c3ada6c
--- /dev/null
+++ b/fs/bcachefs/fs-io-direct.c
@@ -0,0 +1,677 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/prefetch.h>
+#include <linux/task_io_accounting_ops.h>
+
+/* O_DIRECT reads */
+
+struct dio_read {
+ struct closure cl;
+ struct kiocb *req;
+ long ret;
+ bool should_dirty;
+ struct bch_read_bio rbio;
+};
+
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+ if (check_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
+static CLOSURE_CALLBACK(bch2_dio_read_complete)
+{
+ closure_type(dio, struct dio_read, cl);
+
+ dio->req->ki_complete(dio->req, dio->ret);
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+}
+
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+
+ if (bio->bi_status)
+ dio->ret = blk_status_to_errno(bio->bi_status);
+
+ closure_put(&dio->cl);
+}
+
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+ bool should_dirty = dio->should_dirty;
+
+ bch2_direct_IO_read_endio(bio);
+ bio_check_or_release(bio, should_dirty);
+}
+
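+/*
+ * O_DIRECT read: the request must be aligned to the filesystem block size and
+ * is shortened so it doesn't extend past EOF. The iter is split across one or
+ * more bios submitted via bch2_read(), with a closure tracking outstanding
+ * bios; synchronous kiocbs wait here for the result, asynchronous ones
+ * complete from bch2_dio_read_complete() and return -EIOCBQUEUED.
+ */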
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts;
+ struct dio_read *dio;
+ struct bio *bio;
+ loff_t offset = req->ki_pos;
+ bool sync = is_sync_kiocb(req);
+ size_t shorten;
+ ssize_t ret;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ if ((offset|iter->count) & (block_bytes(c) - 1))
+ return -EINVAL;
+
+ ret = min_t(loff_t, iter->count,
+ max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+ if (!ret)
+ return ret;
+
+ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+ iter->count -= shorten;
+
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
+ &c->dio_read_bioset);
+
+ bio->bi_end_io = bch2_direct_IO_read_endio;
+
+ dio = container_of(bio, struct dio_read, rbio.bio);
+ closure_init(&dio->cl, NULL);
+
+ /*
+ * this is a _really_ horrible hack just to avoid an atomic sub at the
+ * end:
+ */
+ if (!sync) {
+ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_RUNNING +
+ CLOSURE_DESTRUCTOR);
+ } else {
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER + 1);
+ dio->cl.closure_get_happened = true;
+ }
+
+ dio->req = req;
+ dio->ret = ret;
+ /*
+ * This is one of the sketchier things I've encountered: we have to skip
+ * the dirtying of requests that are internal to the kernel (i.e. from
+ * loopback), because we'll deadlock on page_lock.
+ */
+ dio->should_dirty = iter_is_iovec(iter);
+
+ goto start;
+ while (iter->count) {
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
+ &c->bio_read);
+ bio->bi_end_io = bch2_direct_IO_read_split_endio;
+start:
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC;
+ bio->bi_iter.bi_sector = offset >> 9;
+ bio->bi_private = dio;
+
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (ret < 0) {
+ /* XXX: fault inject this path */
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ break;
+ }
+
+ offset += bio->bi_iter.bi_size;
+
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
+
+ if (iter->count)
+ closure_get(&dio->cl);
+
+ bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+ }
+
+ iter->count += shorten;
+
+ if (sync) {
+ closure_sync(&dio->cl);
+ closure_debug_destroy(&dio->cl);
+ ret = dio->ret;
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+ return ret;
+ } else {
+ return -EIOCBQUEUED;
+ }
+}
+
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+
+ if (!count)
+ return 0; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct blk_plug plug;
+
+ if (unlikely(mapping->nrpages)) {
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ goto out;
+ }
+
+ file_accessed(file);
+
+ blk_start_plug(&plug);
+ ret = bch2_direct_IO_read(iocb, iter);
+ blk_finish_plug(&plug);
+
+ if (ret >= 0)
+ iocb->ki_pos += ret;
+ } else {
+ bch2_pagecache_add_get(inode);
+ ret = generic_file_read_iter(iocb, iter);
+ bch2_pagecache_add_put(inode);
+ }
+out:
+ return bch2_err_class(ret);
+}
+
+/* O_DIRECT writes */
+
+struct dio_write {
+ struct kiocb *req;
+ struct address_space *mapping;
+ struct bch_inode_info *inode;
+ struct mm_struct *mm;
+ const struct iovec *iov;
+ unsigned loop:1,
+ extending:1,
+ sync:1,
+ flush:1;
+ struct quota_res quota_res;
+ u64 written;
+
+ struct iov_iter iter;
+ struct iovec inline_vecs[2];
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
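+/*
+ * Check whether the extents in [offset, offset + size) are already fully
+ * allocated with at least @nr_replicas replicas and, if @compressed is false,
+ * contain no compressed data - i.e. whether a dio write to that range can
+ * proceed without taking a new disk reservation.
+ */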
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 size,
+ unsigned nr_replicas, bool compressed)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 end = offset + size;
+ u32 snapshot;
+ bool ret = true;
+ int err;
+retry:
+ bch2_trans_begin(trans);
+
+ err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (err)
+ goto err;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, err) {
+ if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
+ break;
+
+ if (k.k->p.snapshot != snapshot ||
+ nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
+ ret = false;
+ break;
+ }
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(err, BCH_ERR_transaction_restart))
+ goto retry;
+ bch2_trans_put(trans);
+
+ return err ? false : ret;
+}
+
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ return bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
+ dio->op.opts.data_replicas,
+ dio->op.opts.compression != 0);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
+ * caller's stack, we're not guaranteed that it will live for the duration of
+ * the IO:
+ */
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+ struct iovec *iov = dio->inline_vecs;
+
+ /*
+ * the iov_iter is a ubuf iter with a single user buffer embedded in the
+ * iter itself - nothing to copy:
+ */
+ if (iter_is_ubuf(&dio->iter))
+ return 0;
+
+ /*
+ * We don't currently handle non-iovec iov_iters here - return an error,
+ * and we'll fall back to doing the IO synchronously:
+ */
+ if (!iter_is_iovec(&dio->iter))
+ return -1;
+
+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+ dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+ GFP_KERNEL);
+ if (unlikely(!iov))
+ return -ENOMEM;
+ }
+
+ memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
+ dio->iter.__iov = iov;
+ return 0;
+}
+
+static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
+{
+ closure_type(dio, struct dio_write, op.cl);
+ struct bch_fs *c = dio->op.c;
+
+ closure_debug_destroy(cl);
+
+ dio->op.error = bch2_journal_error(&c->journal);
+
+ bch2_dio_write_done(dio);
+}
+
+static noinline void bch2_dio_write_flush(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ dio->flush = 0;
+
+ closure_init(&dio->op.cl, NULL);
+
+ if (!dio->op.error) {
+ ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
+ if (ret) {
+ dio->op.error = ret;
+ } else {
+ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
+ &dio->op.cl);
+ bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
+ }
+ }
+
+ if (dio->sync) {
+ closure_sync(&dio->op.cl);
+ closure_debug_destroy(&dio->op.cl);
+ } else {
+ continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
+ }
+}
+
+static __always_inline long bch2_dio_write_done(struct dio_write *dio)
+{
+ struct kiocb *req = dio->req;
+ struct bch_inode_info *inode = dio->inode;
+ bool sync = dio->sync;
+ long ret;
+
+ if (unlikely(dio->flush)) {
+ bch2_dio_write_flush(dio);
+ if (!sync)
+ return -EIOCBQUEUED;
+ }
+
+ bch2_pagecache_block_put(inode);
+
+ kfree(dio->iov);
+
+ ret = dio->op.error ?: ((long) dio->written << 9);
+ bio_put(&dio->op.wbio.bio);
+
+ /* inode->i_dio_count is our ref on inode and thus bch_fs */
+ inode_dio_end(&inode->v);
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+
+ if (!sync) {
+ req->ki_complete(req, ret);
+ ret = -EIOCBQUEUED;
+ }
+ return ret;
+}
+
+static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ req->ki_pos += (u64) dio->op.written << 9;
+ dio->written += dio->op.written;
+
+ if (dio->extending) {
+ spin_lock(&inode->v.i_lock);
+ if (req->ki_pos > inode->v.i_size)
+ i_size_write(&inode->v, req->ki_pos);
+ spin_unlock(&inode->v.i_lock);
+ }
+
+ if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
+ __bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+
+ bio_release_pages(bio, false);
+
+ if (unlikely(dio->op.error))
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+}
+
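+/*
+ * Main dio write loop: pin the next batch of user pages into the bio, set up
+ * the write op, take quota and disk reservations (if the disk reservation
+ * fails, the write can still proceed when the range is already fully
+ * allocated), and submit via bch2_write. Synchronous writes loop here until
+ * the iter is drained; asynchronous writes return -EIOCBQUEUED and continue
+ * from bch2_dio_write_loop_async().
+ */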
+static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct address_space *mapping = dio->mapping;
+ struct bch_inode_info *inode = dio->inode;
+ struct bch_io_opts opts;
+ struct bio *bio = &dio->op.wbio.bio;
+ unsigned unaligned, iter_count;
+ bool sync = dio->sync, dropped_locks;
+ long ret;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ while (1) {
+ iter_count = dio->iter.count;
+
+ EBUG_ON(current->faults_disabled_mapping);
+ current->faults_disabled_mapping = mapping;
+
+ ret = bio_iov_iter_get_pages(bio, &dio->iter);
+
+ dropped_locks = fdm_dropped_locks();
+
+ current->faults_disabled_mapping = NULL;
+
+ /*
+ * If the fault handler returned an error but also signalled
+ * that it dropped & retook ei_pagecache_lock, we just need to
+ * re-shoot down the page cache and retry:
+ */
+ if (dropped_locks && ret)
+ ret = 0;
+
+ if (unlikely(ret < 0))
+ goto err;
+
+ if (unlikely(dropped_locks)) {
+ ret = bch2_write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter_count - 1);
+ if (unlikely(ret))
+ goto err;
+
+ if (!bio->bi_iter.bi_size)
+ continue;
+ }
+
+ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+ bio->bi_iter.bi_size -= unaligned;
+ iov_iter_revert(&dio->iter, unaligned);
+
+ if (!bio->bi_iter.bi_size) {
+ /*
+ * bio_iov_iter_get_pages was only able to get <
+ * blocksize worth of pages:
+ */
+ ret = -EFAULT;
+ goto err;
+ }
+
+ bch2_write_op_init(&dio->op, c, opts);
+ dio->op.end_io = sync
+ ? NULL
+ : bch2_dio_write_loop_async;
+ dio->op.target = dio->op.opts.foreground_target;
+ dio->op.write_point = writepoint_hashed((unsigned long) current);
+ dio->op.nr_replicas = dio->op.opts.data_replicas;
+ dio->op.subvol = inode->ei_subvol;
+ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+ dio->op.devs_need_flush = &inode->ei_devs_need_flush;
+
+ if (sync)
+ dio->op.flags |= BCH_WRITE_SYNC;
+ dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+
+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+ bio_sectors(bio), true);
+ if (unlikely(ret))
+ goto err;
+
+ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
+ dio->op.opts.data_replicas, 0);
+ if (unlikely(ret) &&
+ !bch2_dio_write_check_allocated(dio))
+ goto err;
+
+ task_io_account_write(bio->bi_iter.bi_size);
+
+ if (unlikely(dio->iter.count) &&
+ !dio->sync &&
+ !dio->loop &&
+ bch2_dio_write_copy_iov(dio))
+ dio->sync = sync = true;
+
+ dio->loop = true;
+ closure_call(&dio->op.cl, bch2_write, NULL, NULL);
+
+ if (!sync)
+ return -EIOCBQUEUED;
+
+ bch2_dio_write_end(dio);
+
+ if (likely(!dio->iter.count) || dio->op.error)
+ break;
+
+ bio_reset(bio, NULL, REQ_OP_WRITE);
+ }
+out:
+ return bch2_dio_write_done(dio);
+err:
+ dio->op.error = ret;
+
+ bio_release_pages(bio, false);
+
+ bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ goto out;
+}
+
+static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+{
+ struct mm_struct *mm = dio->mm;
+
+ bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+
+ if (mm)
+ kthread_use_mm(mm);
+ bch2_dio_write_loop(dio);
+ if (mm)
+ kthread_unuse_mm(mm);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
+{
+ struct dio_write *dio = container_of(op, struct dio_write, op);
+
+ bch2_dio_write_end(dio);
+
+ if (likely(!dio->iter.count) || dio->op.error)
+ bch2_dio_write_done(dio);
+ else
+ bch2_dio_write_continue(dio);
+}
+
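+/*
+ * O_DIRECT write entry point: perform the usual write checks under the inode
+ * lock, block page cache insertion, invalidate any cached pages overlapping
+ * the write, then hand off to bch2_dio_write_loop(). The inode lock is
+ * dropped early for writes that don't extend i_size.
+ */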
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct dio_write *dio;
+ struct bio *bio;
+ bool locked = true, extending;
+ ssize_t ret;
+
+ prefetch(&c->opts);
+ prefetch((void *) &c->opts + 64);
+ prefetch(&inode->ei_inode);
+ prefetch((void *) &inode->ei_inode + 64);
+
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(req, iter);
+ if (unlikely(ret <= 0))
+ goto err;
+
+ ret = file_remove_privs(file);
+ if (unlikely(ret))
+ goto err;
+
+ ret = file_update_time(file);
+ if (unlikely(ret))
+ goto err;
+
+ /* dio writes must be aligned to the filesystem block size: */
+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ inode_dio_begin(&inode->v);
+ bch2_pagecache_block_get(inode);
+
+ extending = req->ki_pos + iter->count > inode->v.i_size;
+ if (!extending) {
+ inode_unlock(&inode->v);
+ locked = false;
+ }
+
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_WRITE,
+ GFP_KERNEL,
+ &c->dio_write_bioset);
+ dio = container_of(bio, struct dio_write, op.wbio.bio);
+ dio->req = req;
+ dio->mapping = mapping;
+ dio->inode = inode;
+ dio->mm = current->mm;
+ dio->iov = NULL;
+ dio->loop = false;
+ dio->extending = extending;
+ dio->sync = is_sync_kiocb(req) || extending;
+ dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
+ dio->quota_res.sectors = 0;
+ dio->written = 0;
+ dio->iter = *iter;
+ dio->op.c = c;
+
+ if (unlikely(mapping->nrpages)) {
+ ret = bch2_write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter->count - 1);
+ if (unlikely(ret))
+ goto err_put_bio;
+ }
+
+ ret = bch2_dio_write_loop(dio);
+err:
+ if (locked)
+ inode_unlock(&inode->v);
+ return ret;
+err_put_bio:
+ bch2_pagecache_block_put(inode);
+ bio_put(bio);
+ inode_dio_end(&inode->v);
+ goto err;
+}
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->dio_write_bioset);
+ bioset_exit(&c->dio_read_bioset);
+}
+
+int bch2_fs_fs_io_direct_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->dio_read_bioset,
+ 4, offsetof(struct dio_read, rbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_dio_read_bioset_init;
+
+ if (bioset_init(&c->dio_write_bioset,
+ 4, offsetof(struct dio_write, op.wbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_dio_write_bioset_init;
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h
new file mode 100644
index 000000000000..814621ec7f81
--- /dev/null
+++ b/fs/bcachefs/fs-io-direct.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_DIRECT_H
+#define _BCACHEFS_FS_IO_DIRECT_H
+
+#ifndef NO_BCACHEFS_FS
+ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *);
+int bch2_fs_fs_io_direct_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
new file mode 100644
index 000000000000..ff664fd0d8ef
--- /dev/null
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -0,0 +1,791 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "extents.h"
+#include "fs-io.h"
+#include "fs-io-pagecache.h"
+#include "subvolume.h"
+
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
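+/*
+ * Fill @fs with a contiguous run of locked folios from @mapping covering
+ * [start, end), creating them as needed and stopping at the first gap.
+ * FGP_CREAT is dropped once we're more than 1MB past @start, bounding how
+ * much new page cache a single call will create. Returns 0 if at least one
+ * folio was obtained.
+ */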
+int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
+ loff_t start, u64 end,
+ fgf_t fgp_flags, gfp_t gfp,
+ folios *fs)
+{
+ struct folio *f;
+ u64 pos = start;
+ int ret = 0;
+
+ while (pos < end) {
+ if ((u64) pos >= (u64) start + (1ULL << 20))
+ fgp_flags &= ~FGP_CREAT;
+
+ ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
+ if (ret)
+ break;
+
+ f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
+ if (IS_ERR_OR_NULL(f))
+ break;
+
+ BUG_ON(fs->nr && folio_pos(f) != pos);
+
+ pos = folio_end_pos(f);
+ darray_push(fs, f);
+ }
+
+ if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
+ ret = -ENOMEM;
+
+ return fs->nr ? 0 : ret;
+}
+
+/* pagecache_block must be held */
+int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * XXX: the way this is currently implemented, we can spin if a process
+ * is continually redirtying a specific page
+ */
+ do {
+ if (!mapping->nrpages)
+ return 0;
+
+ ret = filemap_write_and_wait_range(mapping, start, end);
+ if (ret)
+ break;
+
+ if (!mapping->nrpages)
+ return 0;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ } while (ret == -EBUSY);
+
+ return ret;
+}
+
+#if 0
+/* Useful for debug tracing: */
+static const char * const bch2_folio_sector_states[] = {
+#define x(n) #n,
+ BCH_FOLIO_SECTOR_STATE()
+#undef x
+ NULL
+};
+#endif
+
+static inline enum bch_folio_sector_state
+folio_sector_dirty(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_unallocated:
+ return SECTOR_dirty;
+ case SECTOR_reserved:
+ return SECTOR_dirty_reserved;
+ default:
+ return state;
+ }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_undirty(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_dirty:
+ return SECTOR_unallocated;
+ case SECTOR_dirty_reserved:
+ return SECTOR_reserved;
+ default:
+ return state;
+ }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_reserve(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_unallocated:
+ return SECTOR_reserved;
+ case SECTOR_dirty:
+ return SECTOR_dirty_reserved;
+ default:
+ return state;
+ }
+}
+
+/* for newly allocated folios: */
+struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+ struct bch_folio *s;
+
+ s = kzalloc(sizeof(*s) +
+ sizeof(struct bch_folio_sector) *
+ folio_sectors(folio), gfp);
+ if (!s)
+ return NULL;
+
+ spin_lock_init(&s->lock);
+ folio_attach_private(folio, s);
+ return s;
+}
+
+struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+ return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
+}
+
+static unsigned bkey_to_sector_state(struct bkey_s_c k)
+{
+ if (bkey_extent_is_reservation(k))
+ return SECTOR_reserved;
+ if (bkey_extent_is_allocation(k.k))
+ return SECTOR_allocated;
+ return SECTOR_unallocated;
+}
+
+static void __bch2_folio_set(struct folio *folio,
+ unsigned pg_offset, unsigned pg_len,
+ unsigned nr_ptrs, unsigned state)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, sectors = folio_sectors(folio);
+
+ BUG_ON(pg_offset >= sectors);
+ BUG_ON(pg_offset + pg_len > sectors);
+
+ spin_lock(&s->lock);
+
+ for (i = pg_offset; i < pg_offset + pg_len; i++) {
+ s->s[i].nr_replicas = nr_ptrs;
+ bch2_folio_sector_set(folio, s, i, state);
+ }
+
+ if (i == sectors)
+ s->uptodate = true;
+
+ spin_unlock(&s->lock);
+}
+
+/*
+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
+ * extents btree:
+ */
+int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
+ struct folio **fs, unsigned nr_folios)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_folio *s;
+ u64 offset = folio_sector(fs[0]);
+ unsigned folio_idx;
+ u32 snapshot;
+ bool need_set = false;
+ int ret;
+
+ for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+ s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ need_set |= !s->uptodate;
+ }
+
+ if (!need_set)
+ return 0;
+
+ folio_idx = 0;
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ while (folio_idx < nr_folios) {
+ struct folio *folio = fs[folio_idx];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+ folio_start;
+ unsigned folio_len = min(k.k->p.offset, folio_end) -
+ folio_offset - folio_start;
+
+ BUG_ON(k.k->p.offset < folio_start);
+ BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+ if (!bch2_folio(folio)->uptodate)
+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+ if (k.k->p.offset < folio_end)
+ break;
+ folio_idx++;
+ }
+
+ if (folio_idx == nr_folios)
+ break;
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+ struct bvec_iter iter;
+ struct folio_vec fv;
+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ bio_for_each_folio(fv, bio, iter)
+ __bch2_folio_set(fv.fv_folio,
+ fv.fv_offset >> 9,
+ fv.fv_len >> 9,
+ nr_ptrs, state);
+}
+
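+/*
+ * Walk the page cache for sectors [start, end) and zero nr_replicas in the
+ * per-folio sector state, so that subsequent writes to those sectors will
+ * take fresh disk reservations.
+ */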
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct folio_batch fbatch;
+ unsigned i, j;
+
+ if (end <= start)
+ return;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+ struct bch_folio *s;
+
+ BUG_ON(end <= folio_start);
+
+ folio_lock(folio);
+ s = bch2_folio(folio);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = folio_offset; j < folio_offset + folio_len; j++)
+ s->s[j].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+}
+
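+/*
+ * Walk the page cache for sectors [start, end) and move each cached sector's
+ * state to its reserved variant, adjusting i_sectors accounting for sectors
+ * that were previously dirty.
+ */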
+void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct folio_batch fbatch;
+ s64 i_sectors_delta = 0;
+ unsigned i, j;
+
+ if (end <= start)
+ return;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+ struct bch_folio *s;
+
+ BUG_ON(end <= folio_start);
+
+ folio_lock(folio);
+ s = bch2_folio(folio);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = folio_offset; j < folio_offset + folio_len; j++) {
+ i_sectors_delta -= s->s[j].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, j,
+ folio_sector_reserve(s->s[j].state));
+ }
+ spin_unlock(&s->lock);
+ }
+
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
+ unsigned nr_replicas)
+{
+ return max(0, (int) nr_replicas -
+ s->nr_replicas -
+ s->replicas_reserved);
+}
+
+int bch2_get_folio_disk_reservation(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio, bool check_enospc)
+{
+ struct bch_folio *s = bch2_folio_create(folio, 0);
+ unsigned nr_replicas = inode_nr_replicas(c, inode);
+ struct disk_reservation disk_res = { 0 };
+ unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
+ int ret;
+
+ if (!s)
+ return -ENOMEM;
+
+ for (i = 0; i < sectors; i++)
+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+ if (!disk_res_sectors)
+ return 0;
+
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ disk_res_sectors, 1,
+ !check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL
+ : 0);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < sectors; i++)
+ s->s[i].replicas_reserved +=
+ sectors_to_reserve(&s->s[i], nr_replicas);
+
+ return 0;
+}
+
+void bch2_folio_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_folio_reservation *res)
+{
+ bch2_disk_reservation_put(c, &res->disk);
+ bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
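+/*
+ * Take the disk and quota reservations needed to dirty [offset, offset + len)
+ * within @folio, based on the current per-sector state. If the quota
+ * reservation fails, the disk reservation taken here is released again before
+ * returning the error.
+ */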
+int bch2_folio_reservation_get(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ unsigned offset, unsigned len)
+{
+ struct bch_folio *s = bch2_folio_create(folio, 0);
+ unsigned i, disk_sectors = 0, quota_sectors = 0;
+ int ret;
+
+ if (!s)
+ return -ENOMEM;
+
+ BUG_ON(!s->uptodate);
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
+ quota_sectors += s->s[i].state == SECTOR_unallocated;
+ }
+
+ if (disk_sectors) {
+ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (quota_sectors) {
+ ret = bch2_quota_reservation_add(c, inode, &res->quota,
+ quota_sectors, true);
+ if (unlikely(ret)) {
+ struct disk_reservation tmp = {
+ .sectors = disk_sectors
+ };
+
+ bch2_disk_reservation_put(c, &tmp);
+ res->disk.sectors -= disk_sectors;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_clear_folio_bits(struct folio *folio)
+{
+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_folio *s = bch2_folio(folio);
+ struct disk_reservation disk_res = { 0 };
+ int i, sectors = folio_sectors(folio), dirty_sectors = 0;
+
+ if (!s)
+ return;
+
+ EBUG_ON(!folio_test_locked(folio));
+ EBUG_ON(folio_test_writeback(folio));
+
+ for (i = 0; i < sectors; i++) {
+ disk_res.sectors += s->s[i].replicas_reserved;
+ s->s[i].replicas_reserved = 0;
+
+ dirty_sectors -= s->s[i].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
+ }
+
+ bch2_disk_reservation_put(c, &disk_res);
+
+ bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
+
+ bch2_folio_release(folio);
+}
+
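+/*
+ * Mark [offset, offset + len) within @folio dirty: transfer reserved sectors
+ * from @res into each sector's replicas_reserved, move sector states to their
+ * dirty variants, account newly dirtied sectors against quota/i_sectors, and
+ * finally dirty the folio in the page cache if it isn't already.
+ */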
+void bch2_set_folio_dirty(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ unsigned offset, unsigned len)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, dirty_sectors = 0;
+
+ WARN_ON((u64) folio_pos(folio) + offset + len >
+ round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+ BUG_ON(!s->uptodate);
+
+ spin_lock(&s->lock);
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ unsigned sectors = sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
+
+ /*
+ * This can happen if we race with the error path in
+ * bch2_writepage_io_done():
+ */
+ sectors = min_t(unsigned, sectors, res->disk.sectors);
+
+ s->s[i].replicas_reserved += sectors;
+ res->disk.sectors -= sectors;
+
+ dirty_sectors += s->s[i].state == SECTOR_unallocated;
+
+ bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
+ }
+
+ spin_unlock(&s->lock);
+
+ bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+
+ if (!folio_test_dirty(folio))
+ filemap_dirty_folio(inode->v.i_mapping, folio);
+}
+
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+ struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct address_space *fdm = faults_disabled_mapping();
+ struct bch_inode_info *inode = file_bch_inode(file);
+ vm_fault_t ret;
+
+ if (fdm == mapping)
+ return VM_FAULT_SIGBUS;
+
+ /* Lock ordering: */
+ if (fdm > mapping) {
+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+ if (bch2_pagecache_add_tryget(inode))
+ goto got_lock;
+
+ bch2_pagecache_block_put(fdm_host);
+
+ bch2_pagecache_add_get(inode);
+ bch2_pagecache_add_put(inode);
+
+ bch2_pagecache_block_get(fdm_host);
+
+ /* Signal that lock has been dropped: */
+ set_fdm_dropped_locks();
+ return VM_FAULT_SIGBUS;
+ }
+
+ bch2_pagecache_add_get(inode);
+got_lock:
+ ret = filemap_fault(vmf);
+ bch2_pagecache_add_put(inode);
+
+ return ret;
+}
+
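+/*
+ * .page_mkwrite: called when a writable mapping of @folio is about to be
+ * written to. Take the pagecache add lock, revalidate the folio against the
+ * mapping and i_size, set up bch_folio state and disk/quota reservations for
+ * the part of the folio within i_size, and mark it dirty; if the reservation
+ * can't be taken, the fault gets SIGBUS.
+ */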
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct file *file = vmf->vma->vm_file;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation res;
+ unsigned len;
+ loff_t isize;
+ vm_fault_t ret;
+
+ bch2_folio_reservation_init(c, inode, &res);
+
+ sb_start_pagefault(inode->v.i_sb);
+ file_update_time(file);
+
+ /*
+ * Not strictly necessary, but helps avoid dio writes livelocking in
+ * bch2_write_invalidate_inode_pages_range() - this can be dropped if/when
+ * we get a version of that function that can invalidate a page without
+ * dropping the page lock first.
+ */
+ bch2_pagecache_add_get(inode);
+
+ folio_lock(folio);
+ isize = i_size_read(&inode->v);
+
+ if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+ folio_unlock(folio);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
+
+ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
+ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+ folio_unlock(folio);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+ bch2_folio_reservation_put(c, inode, &res);
+
+ folio_wait_stable(folio);
+ ret = VM_FAULT_LOCKED;
+out:
+ bch2_pagecache_add_put(inode);
+ sb_end_pagefault(inode->v.i_sb);
+
+ return ret;
+}
+
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+ if (offset || length < folio_size(folio))
+ return;
+
+ bch2_clear_folio_bits(folio);
+}
+
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
+{
+ if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ return false;
+
+ bch2_clear_folio_bits(folio);
+ return true;
+}
+
+/* fseek: */
+
+static int folio_data_offset(struct folio *folio, loff_t pos,
+ unsigned min_replicas)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, sectors = folio_sectors(folio);
+
+ if (s)
+ for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
+ if (s->s[i].state >= SECTOR_dirty &&
+ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
+ return i << SECTOR_SHIFT;
+
+ return -1;
+}
+
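+/*
+ * Scan the page cache in [start_offset, end_offset) for the first offset
+ * backed by dirty or allocated data with at least @min_replicas replicas;
+ * returns end_offset if nothing is found, or -EAGAIN in nonblocking mode if
+ * a folio couldn't be locked without blocking.
+ */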
+loff_t bch2_seek_pagecache_data(struct inode *vinode,
+ loff_t start_offset,
+ loff_t end_offset,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ struct folio_batch fbatch;
+ pgoff_t start_index = start_offset >> PAGE_SHIFT;
+ pgoff_t end_index = end_offset >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ unsigned i;
+ loff_t ret;
+ int offset;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(vinode->i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (!nonblock) {
+ folio_lock(folio);
+ } else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ return -EAGAIN;
+ }
+
+ offset = folio_data_offset(folio,
+ max(folio_pos(folio), start_offset),
+ min_replicas);
+ if (offset >= 0) {
+ ret = clamp(folio_pos(folio) + offset,
+ start_offset, end_offset);
+ folio_unlock(folio);
+ folio_batch_release(&fbatch);
+ return ret;
+ }
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+
+ return end_offset;
+}
+
+/*
+ * Search for a hole in a folio.
+ *
+ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
+ * code to indicate a pagecache hole exists at the returned offset. Otherwise
+ * return 0 if the folio is filled with data, or an error code. This function
+ * can return -EAGAIN if nonblock is specified.
+ */
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+ unsigned min_replicas, bool nonblock)
+{
+ struct folio *folio;
+ struct bch_folio *s;
+ unsigned i, sectors;
+ int ret = -ENOENT;
+
+ folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+ FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+
+ s = bch2_folio(folio);
+ if (!s)
+ goto unlock;
+
+ sectors = folio_sectors(folio);
+ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
+ if (s->s[i].state < SECTOR_dirty ||
+ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
+ *offset = max(*offset,
+ folio_pos(folio) + (i << SECTOR_SHIFT));
+ goto unlock;
+ }
+
+ *offset = folio_end_pos(folio);
+ ret = 0;
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
+loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+ loff_t start_offset,
+ loff_t end_offset,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ struct address_space *mapping = vinode->i_mapping;
+ loff_t offset = start_offset;
+ loff_t ret = 0;
+
+ while (!ret && offset < end_offset)
+ ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
+
+ if (ret && ret != -ENOENT)
+ return ret;
+ return min(offset, end_offset);
+}
+
+int bch2_clamp_data_hole(struct inode *inode,
+ u64 *hole_start,
+ u64 *hole_end,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ loff_t ret;
+
+ ret = bch2_seek_pagecache_hole(inode,
+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+ if (ret < 0)
+ return ret;
+
+ *hole_start = ret;
+
+ if (*hole_start == *hole_end)
+ return 0;
+
+ ret = bch2_seek_pagecache_data(inode,
+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+ if (ret < 0)
+ return ret;
+
+ *hole_end = ret;
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
new file mode 100644
index 000000000000..27f712ae37a6
--- /dev/null
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
+#define _BCACHEFS_FS_IO_PAGECACHE_H
+
+#include <linux/pagemap.h>
+
+typedef DARRAY(struct folio *) folios;
+
+int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
+ u64, fgf_t, gfp_t, folios *);
+int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
+
+/*
+ * Use u64 for the end pos and sector helpers because if the folio covers the
+ * max supported range of the mapping, the start offset of the next folio
+ * overflows loff_t. This breaks much of the range based processing in the
+ * buffered write path.
+ */
+static inline u64 folio_end_pos(struct folio *folio)
+{
+ return folio_pos(folio) + folio_size(folio);
+}
+
+static inline size_t folio_sectors(struct folio *folio)
+{
+ return PAGE_SECTORS << folio_order(folio);
+}
+
+static inline loff_t folio_sector(struct folio *folio)
+{
+ return folio_pos(folio) >> 9;
+}
+
+static inline u64 folio_end_sector(struct folio *folio)
+{
+ return folio_end_pos(folio) >> 9;
+}
+
+#define BCH_FOLIO_SECTOR_STATE() \
+ x(unallocated) \
+ x(reserved) \
+ x(dirty) \
+ x(dirty_reserved) \
+ x(allocated)
+
+enum bch_folio_sector_state {
+#define x(n) SECTOR_##n,
+ BCH_FOLIO_SECTOR_STATE()
+#undef x
+};
+
+struct bch_folio_sector {
+ /* Uncompressed, fully allocated replicas (or on disk reservation): */
+ unsigned nr_replicas:4;
+
+ /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
+ unsigned replicas_reserved:4;
+
+ /* i_sectors: */
+ enum bch_folio_sector_state state:8;
+};
+
+struct bch_folio {
+ spinlock_t lock;
+ atomic_t write_count;
+ /*
+ * Is the sector state up to date with the btree?
+ * (Not the data itself)
+ */
+ bool uptodate;
+ struct bch_folio_sector s[];
+};
+
+/* Helper for when we need to add debug instrumentation: */
+static inline void bch2_folio_sector_set(struct folio *folio,
+ struct bch_folio *s,
+ unsigned i, unsigned n)
+{
+ s->s[i].state = n;
+}
+
+/* file offset (to folio offset) to bch_folio_sector index */
+static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
+{
+ u64 f_offset = pos - folio_pos(folio);
+
+ BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
+ return f_offset >> SECTOR_SHIFT;
+}
+
+/* variant of bch2_folio_release() without the folio-locked assertion: */
+static inline void __bch2_folio_release(struct folio *folio)
+{
+ kfree(folio_detach_private(folio));
+}
+
+static inline void bch2_folio_release(struct folio *folio)
+{
+ EBUG_ON(!folio_test_locked(folio));
+ __bch2_folio_release(folio);
+}
+
+static inline struct bch_folio *__bch2_folio(struct folio *folio)
+{
+ return folio_has_private(folio)
+ ? (struct bch_folio *) folio_get_private(folio)
+ : NULL;
+}
+
+static inline struct bch_folio *bch2_folio(struct folio *folio)
+{
+ EBUG_ON(!folio_test_locked(folio));
+
+ return __bch2_folio(folio);
+}
+
+struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
+struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
+
+struct bch2_folio_reservation {
+ struct disk_reservation disk;
+ struct quota_res quota;
+};
+
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ /* XXX: this should not be open coded */
+ return inode->ei_inode.bi_data_replicas
+ ? inode->ei_inode.bi_data_replicas - 1
+ : c->opts.data_replicas;
+}
+
+static inline void bch2_folio_reservation_init(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_folio_reservation *res)
+{
+ memset(res, 0, sizeof(*res));
+
+ res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
+void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
+void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+
+int bch2_get_folio_disk_reservation(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *, bool);
+
+void bch2_folio_reservation_put(struct bch_fs *,
+ struct bch_inode_info *,
+ struct bch2_folio_reservation *);
+int bch2_folio_reservation_get(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ unsigned, unsigned);
+
+void bch2_set_folio_dirty(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ unsigned, unsigned);
+
+vm_fault_t bch2_page_fault(struct vm_fault *);
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
+
+loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
+loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
+int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
+
+#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
new file mode 100644
index 000000000000..b0e8144ec550
--- /dev/null
+++ b/fs/bcachefs/fs-io.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-pagecache.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "io_misc.h"
+#include "keylist.h"
+#include "quota.h"
+#include "reflink.h"
+#include "trace.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/sched/signal.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uio.h>
+
+#include <trace/events/writeback.h>
+
+struct nocow_flush {
+ struct closure *cl;
+ struct bch_dev *ca;
+ struct bio bio;
+};
+
+static void nocow_flush_endio(struct bio *_bio)
+{
+
+ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
+
+ closure_put(bio->cl);
+ percpu_ref_put(&bio->ca->io_ref);
+ bio_put(&bio->bio);
+}
+
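+/*
+ * Issue a REQ_OP_FLUSH bio to every device recorded in ei_devs_need_flush
+ * (clearing the mask), with completion chained onto @cl - used on the fsync
+ * and dsync dio paths so that preceding nocow writes hit stable media.
+ */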
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct closure *cl)
+{
+ struct nocow_flush *bio;
+ struct bch_dev *ca;
+ struct bch_devs_mask devs;
+ unsigned dev;
+
+ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
+ if (dev == BCH_SB_MEMBERS_MAX)
+ return;
+
+ devs = inode->ei_devs_need_flush;
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
+
+ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
+
+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
+ REQ_OP_FLUSH,
+ GFP_KERNEL,
+ &c->nocow_flush_bioset),
+ struct nocow_flush, bio);
+ bio->cl = cl;
+ bio->ca = ca;
+ bio->bio.bi_end_io = nocow_flush_endio;
+ closure_bio_submit(&bio->bio, cl);
+ }
+}
+
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+ closure_sync(&cl);
+
+ return 0;
+}
+
+/* i_size updates: */
+
+struct inode_new_size {
+ loff_t new_size;
+ u64 now;
+ unsigned fields;
+};
+
+static int inode_set_size(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct inode_new_size *s = p;
+
+ bi->bi_size = s->new_size;
+ if (s->fields & ATTR_ATIME)
+ bi->bi_atime = s->now;
+ if (s->fields & ATTR_MTIME)
+ bi->bi_mtime = s->now;
+ if (s->fields & ATTR_CTIME)
+ bi->bi_ctime = s->now;
+
+ return 0;
+}
+
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ loff_t new_size, unsigned fields)
+{
+ struct inode_new_size s = {
+ .new_size = new_size,
+ .now = bch2_current_time(c),
+ .fields = fields,
+ };
+
+ return bch2_write_inode(c, inode, inode_set_size, &s, fields);
+}
+
+void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
+{
+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+ inode->ei_inode.bi_sectors);
+ inode->v.i_blocks += sectors;
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+ if (quota_res &&
+ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
+ sectors > 0) {
+ BUG_ON(sectors > quota_res->sectors);
+ BUG_ON(sectors > inode->ei_quota_reserved);
+
+ quota_res->sectors -= sectors;
+ inode->ei_quota_reserved -= sectors;
+ } else {
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
+ }
+#endif
+}
+
+/* fsync: */
+
+/*
+ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
+ * insert trigger: look up the btree inode instead
+ */
+static int bch2_flush_inode(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct bch_inode_unpacked u;
+ int ret;
+
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
+ if (ret)
+ return ret;
+
+ return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
+}
+
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret, ret2, ret3;
+
+ ret = file_write_and_wait_range(file, start, end);
+ ret2 = sync_inode_metadata(&inode->v, 1);
+ ret3 = bch2_flush_inode(c, inode);
+
+ return bch2_err_class(ret ?: ret2 ?: ret3);
+}
+
+/* truncate: */
+
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+ struct bpos start,
+ struct bpos end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
+ if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
+ ret = 1;
+ break;
+ }
+ start = iter.pos;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
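+/*
+ * Zero the part of the folio at @index that falls within [start, end),
+ * reading the folio in first if needed and dropping reservations/i_sectors
+ * for the blocks that are now entirely zeroed. Returns > 0 if the folio will
+ * still be written out by writeback (and will thus handle the i_size update),
+ * 0 if not, or a negative error.
+ */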
+static int __bch2_truncate_folio(struct bch_inode_info *inode,
+ pgoff_t index, loff_t start, loff_t end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ struct bch_folio *s;
+ unsigned start_offset;
+ unsigned end_offset;
+ unsigned i;
+ struct folio *folio;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
+ u64 end_pos;
+
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR_OR_NULL(folio)) {
+ /*
+ * XXX: we're doing two index lookups when we end up reading the
+ * folio
+ */
+ ret = range_has_data(c, inode->ei_subvol,
+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
+ if (ret <= 0)
+ return ret;
+
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK|FGP_CREAT, GFP_KERNEL);
+ if (IS_ERR_OR_NULL(folio)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ BUG_ON(start >= folio_end_pos(folio));
+ BUG_ON(end <= folio_pos(folio));
+
+ start_offset = max(start, folio_pos(folio)) - folio_pos(folio);
+ end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
+
+ /* Folio boundary? Nothing to do */
+ if (start_offset == 0 &&
+ end_offset == folio_size(folio)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ s = bch2_folio_create(folio, 0);
+ if (!s) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ if (!folio_test_uptodate(folio)) {
+ ret = bch2_read_single_folio(folio, mapping);
+ if (ret)
+ goto unlock;
+ }
+
+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+ if (ret)
+ goto unlock;
+
+ for (i = round_up(start_offset, block_bytes(c)) >> 9;
+ i < round_down(end_offset, block_bytes(c)) >> 9;
+ i++) {
+ s->s[i].nr_replicas = 0;
+
+ i_sectors_delta -= s->s[i].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
+ }
+
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+ /*
+ * Caller needs to know whether this folio will be written out by
+ * writeback - doing an i_size update if necessary - or whether it will
+ * be responsible for the i_size update.
+ *
+ * Note that we shouldn't ever see a folio beyond EOF, but check and
+ * warn if so. This has been observed when folios weren't cleaned up
+ * after a short write, and there's still a chance reclaim will fix
+ * things up.
+ */
+ WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
+ end_pos = folio_end_pos(folio);
+ if (inode->v.i_size > folio_pos(folio))
+ end_pos = min_t(u64, inode->v.i_size, end_pos);
+ ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
+
+ folio_zero_segment(folio, start_offset, end_offset);
+
+ /*
+ * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+ *
+ * XXX: because we aren't currently tracking whether the folio has actual
+ * data in it (vs. just 0s, or only partially written) this is wrong. ick.
+ */
+ BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
+
+ /*
+ * This removes any writeable userspace mappings; we need to force
+ * .page_mkwrite to be called again before any mmapped writes, to
+ * redirty the full page:
+ */
+ folio_mkclean(folio);
+ filemap_dirty_folio(mapping, folio);
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+out:
+ return ret;
+}
+
+static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
+{
+ return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
+ from, ANYSINT_MAX(loff_t));
+}
+
+static int bch2_truncate_folios(struct bch_inode_info *inode,
+ loff_t start, loff_t end)
+{
+ int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
+ start, end);
+
+ if (ret >= 0 &&
+ start >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_folio(inode,
+ (end - 1) >> PAGE_SHIFT,
+ start, end);
+ return ret;
+}
+
+static int bch2_extend(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *inode_u,
+ struct iattr *iattr)
+{
+ struct address_space *mapping = inode->v.i_mapping;
+ int ret;
+
+ /*
+ * sync appends:
+ *
+ * this has to be done _before_ extending i_size:
+ */
+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
+ if (ret)
+ return ret;
+
+ truncate_setsize(&inode->v, iattr->ia_size);
+
+ return bch2_setattr_nonsize(idmap, inode, iattr);
+}
+
+int bchfs_truncate(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode, struct iattr *iattr)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ struct bch_inode_unpacked inode_u;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
+
+ /*
+ * If the truncate call will change the size of the file, the
+ * cmtimes should be updated. If the size will not change, we
+ * do not need to update the cmtimes.
+ */
+ if (iattr->ia_size != inode->v.i_size) {
+ if (!(iattr->ia_valid & ATTR_MTIME))
+ ktime_get_coarse_real_ts64(&iattr->ia_mtime);
+ if (!(iattr->ia_valid & ATTR_CTIME))
+ ktime_get_coarse_real_ts64(&iattr->ia_ctime);
+ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+ }
+
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(inode);
+
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
+ if (ret)
+ goto err;
+
+ /*
+ * check this before next assertion; on filesystem error our normal
+ * invariants are a bit broken (truncate has to truncate the page cache
+ * before the inode).
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
+ WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+ inode->v.i_size < inode_u.bi_size,
+ "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
+ (u64) inode->v.i_size, inode_u.bi_size);
+
+ if (iattr->ia_size > inode->v.i_size) {
+ ret = bch2_extend(idmap, inode, &inode_u, iattr);
+ goto err;
+ }
+
+ iattr->ia_valid &= ~ATTR_SIZE;
+
+ ret = bch2_truncate_folio(inode, iattr->ia_size);
+ if (unlikely(ret < 0))
+ goto err;
+
+ truncate_setsize(&inode->v, iattr->ia_size);
+
+ /*
+ * When extending, we're going to write the new i_size to disk
+ * immediately so we need to flush anything above the current on disk
+ * i_size first:
+ *
+ * Also, when extending we need to flush the page that i_size currently
+ * straddles - if it's mapped to userspace, we need to ensure that
+ * userspace redirties it and calls .mkwrite -> set_page_dirty again
+ * to allocate the part of the page that was extended.
+ */
+ if (iattr->ia_size > inode_u.bi_size)
+ ret = filemap_write_and_wait_range(mapping,
+ inode_u.bi_size,
+ iattr->ia_size - 1);
+ else if (iattr->ia_size & (PAGE_SIZE - 1))
+ ret = filemap_write_and_wait_range(mapping,
+ round_down(iattr->ia_size, PAGE_SIZE),
+ iattr->ia_size - 1);
+ if (ret)
+ goto err;
+
+ ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+ if (unlikely(ret)) {
+ /*
+ * If we error here, VFS caches are now inconsistent with btree
+ */
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+ goto err;
+ }
+
+ bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
+ !bch2_journal_error(&c->journal), c,
+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks,
+ inode->ei_inode.bi_sectors);
+
+ ret = bch2_setattr_nonsize(idmap, inode, iattr);
+err:
+ bch2_pagecache_block_put(inode);
+ return bch2_err_class(ret);
+}
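Before calling bch2_truncate(), bchfs_truncate() flushes only as much of the page cache as the on-disk i_size update requires: everything above the on-disk size when the in-memory size is larger, or just the final partial page when the new size is not page aligned. A small standalone sketch of that range selection (hypothetical helper, assuming 4 KiB pages):

```c
#include <stdio.h>

#define PAGE_SIZE 4096ULL

/*
 * Illustration of the flush-range selection above: returns 1 and fills
 * [*lstart, *lend] when a filemap_write_and_wait_range() call would be made.
 */
static int flush_range_for_truncate(unsigned long long new_size,
				    unsigned long long ondisk_size,
				    unsigned long long *lstart,
				    unsigned long long *lend)
{
	if (new_size > ondisk_size) {
		*lstart = ondisk_size;
		*lend = new_size - 1;
		return 1;
	}
	if (new_size & (PAGE_SIZE - 1)) {
		*lstart = new_size & ~(PAGE_SIZE - 1);
		*lend = new_size - 1;
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long long s, e;

	if (flush_range_for_truncate(10000, 8192, &s, &e))
		printf("flush [%llu, %llu]\n", s, e);	/* [8192, 9999] */
	if (flush_range_for_truncate(5000, 8192, &s, &e))
		printf("flush [%llu, %llu]\n", s, e);	/* [4096, 4999] */
	return 0;
}
```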
+
+/* fallocate: */
+
+static int inode_update_times_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi, void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
+ return 0;
+}
+
+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ u64 end = offset + len;
+ u64 block_start = round_up(offset, block_bytes(c));
+ u64 block_end = round_down(end, block_bytes(c));
+ bool truncated_last_page;
+ int ret = 0;
+
+ ret = bch2_truncate_folios(inode, offset, end);
+ if (unlikely(ret < 0))
+ goto err;
+
+ truncated_last_page = ret;
+
+ truncate_pagecache_range(&inode->v, offset, end - 1);
+
+ if (block_start < block_end) {
+ s64 i_sectors_delta = 0;
+
+ ret = bch2_fpunch(c, inode_inum(inode),
+ block_start >> 9, block_end >> 9,
+ &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ if (end >= inode->v.i_size && !truncated_last_page) {
+ ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+ ATTR_MTIME|ATTR_CTIME);
+ } else {
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_MTIME|ATTR_CTIME);
+ }
+ mutex_unlock(&inode->ei_update_lock);
+err:
+ return ret;
+}
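bchfs_fpunch() splits the requested hole into an unaligned head and tail (zeroed within the affected folios by bch2_truncate_folios()) and the fully covered middle blocks (dropped from the extent btree by bch2_fpunch()). A tiny standalone illustration of the split, assuming a hypothetical 4 KiB block size:

```c
#include <stdio.h>

#define BLOCK_BYTES 4096ULL	/* hypothetical block size */

#define ROUND_UP(x, a)   ((((x) + (a) - 1) / (a)) * (a))
#define ROUND_DOWN(x, a) (((x) / (a)) * (a))

int main(void)
{
	unsigned long long offset = 1000, len = 20000, end = offset + len;
	unsigned long long block_start = ROUND_UP(offset, BLOCK_BYTES);
	unsigned long long block_end   = ROUND_DOWN(end, BLOCK_BYTES);

	printf("zero head  [%llu, %llu)\n", offset, block_start);
	printf("punch      [%llu, %llu) via the extent btree\n",
	       block_start, block_end);
	printf("zero tail  [%llu, %llu)\n", block_end, end);
	return 0;
}
```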
+
+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
+ loff_t offset, loff_t len,
+ bool insert)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
+
+ if ((offset | len) & (block_bytes(c) - 1))
+ return -EINVAL;
+
+ if (insert) {
+ if (offset >= inode->v.i_size)
+ return -EINVAL;
+ } else {
+ if (offset + len >= inode->v.i_size)
+ return -EINVAL;
+ }
+
+ ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+ if (ret)
+ return ret;
+
+ if (insert)
+ i_size_write(&inode->v, inode->v.i_size + len);
+
+ ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
+ insert, &i_sectors_delta);
+ if (!ret && !insert)
+ i_size_write(&inode->v, inode->v.i_size - len);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+ return ret;
+}
+
+static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ u64 start_sector, u64 end_sector)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bpos end_pos = POS(inode->v.i_ino, end_sector);
+ struct bch_io_opts opts;
+ int ret = 0;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, start_sector),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ while (!ret && bkey_lt(iter.pos, end_pos)) {
+ s64 i_sectors_delta = 0;
+ struct quota_res quota_res = { 0 };
+ struct bkey_s_c k;
+ unsigned sectors;
+ bool is_allocation;
+ u64 hole_start, hole_end;
+ u32 snapshot;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans,
+ inode->ei_subvol, &snapshot);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ if ((ret = bkey_err(k)))
+ goto bkey_err;
+
+ hole_start = iter.pos.offset;
+ hole_end = bpos_min(k.k->p, end_pos).offset;
+ is_allocation = bkey_extent_is_allocation(k.k);
+
+ /* already reserved */
+ if (bkey_extent_is_reservation(k) &&
+ bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
+ bch2_btree_iter_advance(&iter);
+ continue;
+ }
+
+ if (bkey_extent_is_data(k.k) &&
+ !(mode & FALLOC_FL_ZERO_RANGE)) {
+ bch2_btree_iter_advance(&iter);
+ continue;
+ }
+
+ if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+ /*
+ * Lock ordering - can't be holding btree locks while
+ * blocking on a folio lock:
+ */
+ if (bch2_clamp_data_hole(&inode->v,
+ &hole_start,
+ &hole_end,
+ opts.data_replicas, true))
+ ret = drop_locks_do(trans,
+ (bch2_clamp_data_hole(&inode->v,
+ &hole_start,
+ &hole_end,
+ opts.data_replicas, false), 0));
+ bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
+
+ if (ret)
+ goto bkey_err;
+
+ if (hole_start == hole_end)
+ continue;
+ }
+
+ sectors = hole_end - hole_start;
+
+ if (!is_allocation) {
+ ret = bch2_quota_reservation_add(c, inode,
+ &quota_res, sectors, true);
+ if (unlikely(ret))
+ goto bkey_err;
+ }
+
+ ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
+ sectors, opts, &i_sectors_delta,
+ writepoint_hashed((unsigned long) current));
+ if (ret)
+ goto bkey_err;
+
+ bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+
+ drop_locks_do(trans,
+ (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+bkey_err:
+ bch2_quota_reservation_put(c, inode, &quota_res);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ }
+
+ if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
+ struct quota_res quota_res = { 0 };
+ s64 i_sectors_delta = 0;
+
+ bch2_fpunch_at(trans, &iter, inode_inum(inode),
+ end_sector, &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+ bch2_quota_reservation_put(c, inode, &quota_res);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ loff_t offset, loff_t len)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ u64 end = offset + len;
+ u64 block_start = round_down(offset, block_bytes(c));
+ u64 block_end = round_up(end, block_bytes(c));
+ bool truncated_last_page = false;
+ int ret, ret2 = 0;
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+ ret = inode_newsize_ok(&inode->v, end);
+ if (ret)
+ return ret;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = bch2_truncate_folios(inode, offset, end);
+ if (unlikely(ret < 0))
+ return ret;
+
+ truncated_last_page = ret;
+
+ truncate_pagecache_range(&inode->v, offset, end - 1);
+
+ block_start = round_up(offset, block_bytes(c));
+ block_end = round_down(end, block_bytes(c));
+ }
+
+ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
+
+ /*
+ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
+ * so that the VFS cache i_size is consistent with the btree i_size:
+ */
+ if (ret &&
+ !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
+ return ret;
+
+ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
+ end = inode->v.i_size;
+
+ if (end >= inode->v.i_size &&
+ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+ !(mode & FALLOC_FL_KEEP_SIZE))) {
+ spin_lock(&inode->v.i_lock);
+ i_size_write(&inode->v, end);
+ spin_unlock(&inode->v.i_lock);
+
+ mutex_lock(&inode->ei_update_lock);
+ ret2 = bch2_write_inode_size(c, inode, end, 0);
+ mutex_unlock(&inode->ei_update_lock);
+ }
+
+ return ret ?: ret2;
+}
+
+long bch2_fallocate_dispatch(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
+ return -EROFS;
+
+ inode_lock(&inode->v);
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(inode);
+
+ ret = file_modified(file);
+ if (ret)
+ goto err;
+
+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+ ret = bchfs_fallocate(inode, mode, offset, len);
+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+ ret = bchfs_fpunch(inode, offset, len);
+ else if (mode == FALLOC_FL_INSERT_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, true);
+ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, false);
+ else
+ ret = -EOPNOTSUPP;
+err:
+ bch2_pagecache_block_put(inode);
+ inode_unlock(&inode->v);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
+
+ return bch2_err_class(ret);
+}
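From userspace these paths are reached through fallocate(2); per the dispatch above, plain preallocation (with or without FALLOC_FL_KEEP_SIZE), FALLOC_FL_ZERO_RANGE, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, FALLOC_FL_INSERT_RANGE and FALLOC_FL_COLLAPSE_RANGE are accepted, and anything else returns EOPNOTSUPP. A minimal usage example (file path is hypothetical, error handling kept short):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/bcachefs/testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* preallocate 1 MiB, extending i_size */
	if (fallocate(fd, 0, 0, 1 << 20))
		perror("fallocate");

	/* punch a hole in the middle; KEEP_SIZE is mandatory for punch */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      64 << 10, 128 << 10))
		perror("punch hole");

	/* zero a range, leaving reserved/unwritten extents behind */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 256 << 10, 64 << 10))
		perror("zero range");

	close(fd);
	return 0;
}
```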
+
+/*
+ * Take a quota reservation for unallocated blocks in a given file range.
+ * Does not check the pagecache.
+ */
+static int quota_reserve_range(struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot;
+ u64 sectors = end - start;
+ u64 pos = start;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, pos, snapshot), 0);
+
+ while (!(ret = btree_trans_too_many_iters(trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
+ !(ret = bkey_err(k))) {
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
+ bch2_btree_iter_advance(&iter);
+ }
+ pos = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+
+ return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
+}
+
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+ struct file *file_dst, loff_t pos_dst,
+ loff_t len, unsigned remap_flags)
+{
+ struct bch_inode_info *src = file_bch_inode(file_src);
+ struct bch_inode_info *dst = file_bch_inode(file_dst);
+ struct bch_fs *c = src->v.i_sb->s_fs_info;
+ struct quota_res quota_res = { 0 };
+ s64 i_sectors_delta = 0;
+ u64 aligned_len;
+ loff_t ret = 0;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+
+ if ((pos_src & (block_bytes(c) - 1)) ||
+ (pos_dst & (block_bytes(c) - 1)))
+ return -EINVAL;
+
+ if (src == dst &&
+ abs(pos_src - pos_dst) < len)
+ return -EINVAL;
+
+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+ inode_dio_wait(&src->v);
+ inode_dio_wait(&dst->v);
+
+ ret = generic_remap_file_range_prep(file_src, pos_src,
+ file_dst, pos_dst,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto err;
+
+ aligned_len = round_up((u64) len, block_bytes(c));
+
+ ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
+ pos_dst, pos_dst + len - 1);
+ if (ret)
+ goto err;
+
+ ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
+ (pos_dst + aligned_len) >> 9);
+ if (ret)
+ goto err;
+
+ file_update_time(file_dst);
+
+ bch2_mark_pagecache_unallocated(src, pos_src >> 9,
+ (pos_src + aligned_len) >> 9);
+
+ ret = bch2_remap_range(c,
+ inode_inum(dst), pos_dst >> 9,
+ inode_inum(src), pos_src >> 9,
+ aligned_len >> 9,
+ pos_dst + len, &i_sectors_delta);
+ if (ret < 0)
+ goto err;
+
+ /*
+ * due to alignment, we might have remapped slightly more than requested
+ */
+ ret = min((u64) ret << 9, (u64) len);
+
+ bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
+
+ spin_lock(&dst->v.i_lock);
+ if (pos_dst + ret > dst->v.i_size)
+ i_size_write(&dst->v, pos_dst + ret);
+ spin_unlock(&dst->v.i_lock);
+
+ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
+ IS_SYNC(file_inode(file_dst)))
+ ret = bch2_flush_inode(c, dst);
+err:
+ bch2_quota_reservation_put(c, dst, &quota_res);
+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+ return bch2_err_class(ret);
+}
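This is the backend for reflink-style clones: pos_src and pos_dst must be block aligned, and REMAP_FILE_DEDUP is rejected. From userspace the usual entry points are copy_file_range(2) or the FICLONE/FICLONERANGE ioctls; a minimal FICLONERANGE example (paths hypothetical, error handling minimal):

```c
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int src = open("/mnt/bcachefs/src", O_RDONLY);
	int dst = open("/mnt/bcachefs/dst", O_RDWR | O_CREAT, 0644);
	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}

	struct file_clone_range fcr = {
		.src_fd      = src,
		.src_offset  = 0,
		.src_length  = 0,	/* 0 means "up to EOF of the source" */
		.dest_offset = 0,
	};

	if (ioctl(dst, FICLONERANGE, &fcr))
		perror("FICLONERANGE");

	close(src);
	close(dst);
	return 0;
}
```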
+
+/* fseek: */
+
+static loff_t bch2_seek_data(struct file *file, u64 offset)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
+ u64 isize, next_data = MAX_LFS_FILESIZE;
+ u32 snapshot;
+ int ret;
+
+ isize = i_size_read(&inode->v);
+ if (offset >= isize)
+ return -ENXIO;
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot),
+ POS(inode->v.i_ino, U64_MAX),
+ 0, k, ret) {
+ if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset >> 9 > isize)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ if (ret)
+ return ret;
+
+ if (next_data > offset)
+ next_data = bch2_seek_pagecache_data(&inode->v,
+ offset, next_data, 0, false);
+
+ if (next_data >= isize)
+ return -ENXIO;
+
+ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
+}
+
+static loff_t bch2_seek_hole(struct file *file, u64 offset)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
+ u64 isize, next_hole = MAX_LFS_FILESIZE;
+ u32 snapshot;
+ int ret;
+
+ isize = i_size_read(&inode->v);
+ if (offset >= isize)
+ return -ENXIO;
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ if (k.k->p.inode != inode->v.i_ino) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ offset, MAX_LFS_FILESIZE, 0, false);
+ break;
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9, 0, false);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ if (ret)
+ return ret;
+
+ if (next_hole > isize)
+ next_hole = isize;
+
+ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
+}
+
+loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ ret = generic_file_llseek(file, offset, whence);
+ break;
+ case SEEK_DATA:
+ ret = bch2_seek_data(file, offset);
+ break;
+ case SEEK_HOLE:
+ ret = bch2_seek_hole(file, offset);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return bch2_err_class(ret);
+}
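SEEK_DATA/SEEK_HOLE consult both the extent btree and the page cache, so dirty data that has not been written back yet still counts as data. A small userspace walker that enumerates the data segments of a file with lseek(2):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "testfile", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	off_t data = 0;
	for (;;) {
		data = lseek(fd, data, SEEK_DATA);
		if (data < 0)
			break;			/* ENXIO: no more data */
		off_t hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data [%lld, %lld)\n", (long long) data, (long long) hole);
		data = hole;
	}
	close(fd);
	return 0;
}
```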
+
+void bch2_fs_fsio_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->nocow_flush_bioset);
+}
+
+int bch2_fs_fsio_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->nocow_flush_bioset,
+ 1, offsetof(struct nocow_flush, bio), 0))
+ return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
new file mode 100644
index 000000000000..ca70346e68dc
--- /dev/null
+++ b/fs/bcachefs/fs-io.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_H
+#define _BCACHEFS_FS_IO_H
+
+#ifndef NO_BCACHEFS_FS
+
+#include "buckets.h"
+#include "fs.h"
+#include "io_write_types.h"
+#include "quota.h"
+
+#include <linux/uio.h>
+
+struct folio_vec {
+ struct folio *fv_folio;
+ size_t fv_offset;
+ size_t fv_len;
+};
+
+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
+{
+
+ struct folio *folio = page_folio(bv.bv_page);
+ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
+ bv.bv_offset;
+ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
+
+ return (struct folio_vec) {
+ .fv_folio = folio,
+ .fv_offset = offset,
+ .fv_len = len,
+ };
+}
+
+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
+ struct bvec_iter iter)
+{
+ return biovec_to_foliovec(bio_iter_iovec(bio, iter));
+}
+
+#define __bio_for_each_folio(bvl, bio, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
+ bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
+
+/**
+ * bio_for_each_folio - iterate over folios within a bio
+ *
+ * Like other non-_all versions, this iterates over what bio->bi_iter currently
+ * points to. This version is for drivers, where the bio may have previously
+ * been split or cloned.
+ */
+#define bio_for_each_folio(bvl, bio, iter) \
+ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
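A minimal sketch of how the iterator above would be used, assuming the usual block-layer headers are available and that `bio` is one the caller already owns; the loop only logs each folio segment:

```c
/* Illustrative only: walk the folios covered by a bio's current iterator. */
static void log_bio_folios(struct bio *bio)
{
	struct folio_vec fv;
	struct bvec_iter iter;

	bio_for_each_folio(fv, bio, iter)
		pr_debug("folio segment: offset %zu, len %zu\n",
			 fv.fv_offset, fv.fv_len);
}
```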
+
+struct quota_res {
+ u64 sectors;
+};
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
+ inode->ei_quota_reserved -= res->sectors;
+ res->sectors = 0;
+}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ if (res->sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_quota_reservation_put(c, inode, res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 sectors,
+ bool check_enospc)
+{
+ int ret;
+
+ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
+ return 0;
+
+ mutex_lock(&inode->ei_quota_lock);
+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
+ if (likely(!ret)) {
+ inode->ei_quota_reserved += sectors;
+ res->sectors += sectors;
+ }
+ mutex_unlock(&inode->ei_quota_lock);
+
+ return ret;
+}
+
+#else
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res,
+ unsigned sectors,
+ bool check_enospc)
+{
+ return 0;
+}
+
+#endif
+
+void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
+ struct quota_res *, s64);
+
+static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
+{
+ if (sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_i_sectors_acct(c, inode, quota_res, sectors);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
+static inline struct address_space *faults_disabled_mapping(void)
+{
+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+ current->faults_disabled_mapping =
+ (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+ return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
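faults_disabled_mapping packs a flag into bit 0 of the stored pointer: mapping pointers are at least word aligned, so the low bit is always free to record "locks were dropped". A tiny standalone demonstration of the same tagging trick (hypothetical helper names, not the kernel API):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Store a one-bit flag in the (always zero) low bit of an aligned pointer. */
static void *tag_set(void *p)   { return (void *) ((uintptr_t) p | 1); }
static void *tag_clear(void *p) { return (void *) ((uintptr_t) p & ~(uintptr_t) 1); }
static int   tag_test(void *p)  { return (int) ((uintptr_t) p & 1); }

int main(void)
{
	long x;				/* any word-aligned object */
	void *p = &x;

	assert(!tag_test(p));
	p = tag_set(p);
	assert(tag_test(p));
	assert(tag_clear(p) == (void *) &x);
	printf("tagged pointer trick ok\n");
	return 0;
}
```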
+
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
+ struct bch_inode_info *, struct closure *);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+ struct bch_inode_info *,
+ loff_t, unsigned);
+
+int bch2_fsync(struct file *, loff_t, loff_t, int);
+
+int bchfs_truncate(struct mnt_idmap *,
+ struct bch_inode_info *, struct iattr *);
+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+ loff_t, loff_t, unsigned);
+
+loff_t bch2_llseek(struct file *, loff_t, int);
+
+void bch2_fs_fsio_exit(struct bch_fs *);
+int bch2_fs_fsio_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
new file mode 100644
index 000000000000..14d5cc6f90d7
--- /dev/null
+++ b/fs/bcachefs/fs-ioctl.c
@@ -0,0 +1,570 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "fs.h"
+#include "fs-common.h"
+#include "fs-ioctl.h"
+#include "quota.h"
+
+#include <linux/compat.h>
+#include <linux/fsnotify.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
+
+#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
+#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
+#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
+#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+
+struct flags_set {
+ unsigned mask;
+ unsigned flags;
+
+ unsigned projid;
+
+ bool set_projinherit;
+ bool projinherit;
+};
+
+static int bch2_inode_flags_set(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ /*
+ * We're relying on btree locking here for exclusion with other ioctl
+ * calls - use the flags in the btree (@bi), not inode->i_flags:
+ */
+ struct flags_set *s = p;
+ unsigned newflags = s->flags;
+ unsigned oldflags = bi->bi_flags & s->mask;
+
+ if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ if (!S_ISREG(bi->bi_mode) &&
+ !S_ISDIR(bi->bi_mode) &&
+ (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
+ return -EINVAL;
+
+ if (s->set_projinherit) {
+ bi->bi_fields_set &= ~(1 << Inode_opt_project);
+ bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
+ }
+
+ bi->bi_flags &= ~s->mask;
+ bi->bi_flags |= newflags;
+
+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
+ return 0;
+}
+
+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
+{
+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
+
+ return put_user(flags, arg);
+}
+
+static int bch2_ioc_setflags(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *inode,
+ void __user *arg)
+{
+ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
+ unsigned uflags;
+ int ret;
+
+ if (get_user(uflags, (int __user *) arg))
+ return -EFAULT;
+
+ s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
+ if (uflags)
+ return -EOPNOTSUPP;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ inode_lock(&inode->v);
+ if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
+ ret = -EACCES;
+ goto setflags_out;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
+ ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+
+setflags_out:
+ inode_unlock(&inode->v);
+ mnt_drop_write_file(file);
+ return ret;
+}
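These handlers implement the standard FS_IOC_GETFLAGS/FS_IOC_SETFLAGS interface also used by chattr/lsattr. A minimal userspace example toggling the append-only flag (path hypothetical; per the check above, changing append/immutable needs CAP_LINUX_IMMUTABLE):

```c
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/bcachefs/logfile", O_RDONLY);
	int flags;

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags)) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}

	flags |= FS_APPEND_FL;
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags))
		perror("FS_IOC_SETFLAGS");

	close(fd);
	return 0;
}
```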
+
+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
+ struct fsxattr __user *arg)
+{
+ struct fsxattr fa = { 0 };
+
+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+
+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+ fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
+ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
+
+ if (copy_to_user(arg, &fa, sizeof(fa)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct flags_set *s = p;
+
+ if (s->projid != bi->bi_project) {
+ bi->bi_fields_set |= 1U << Inode_opt_project;
+ bi->bi_project = s->projid;
+ }
+
+ return bch2_inode_flags_set(trans, inode, bi, p);
+}
+
+static int bch2_ioc_fssetxattr(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *inode,
+ struct fsxattr __user *arg)
+{
+ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
+ struct fsxattr fa;
+ int ret;
+
+ if (copy_from_user(&fa, arg, sizeof(fa)))
+ return -EFAULT;
+
+ s.set_projinherit = true;
+ s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
+ fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
+ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
+ if (fa.fsx_xflags)
+ return -EOPNOTSUPP;
+
+ if (fa.fsx_projid >= U32_MAX)
+ return -EINVAL;
+
+ /*
+ * inode fields accessible via the xattr interface are stored with a +1
+ * bias, so that 0 means unset:
+ */
+ s.projid = fa.fsx_projid + 1;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ inode_lock(&inode->v);
+ if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
+ ret = -EACCES;
+ goto err;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ bch2_set_projid(c, inode, fa.fsx_projid) ?:
+ bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
+ ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+err:
+ inode_unlock(&inode->v);
+ mnt_drop_write_file(file);
+ return ret;
+}
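FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR use the generic struct fsxattr from linux/fs.h; note that the +1 bias on the project ID is applied inside the handler, so userspace passes the raw ID. A minimal example assigning a project ID (path and ID hypothetical):

```c
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/bcachefs/projdir", O_RDONLY);
	struct fsxattr fa;

	if (fd < 0 || ioctl(fd, FS_IOC_FSGETXATTR, &fa)) {
		perror("FS_IOC_FSGETXATTR");
		return 1;
	}

	fa.fsx_projid = 42;			/* hypothetical project id */
	fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;	/* new children inherit it */

	if (ioctl(fd, FS_IOC_FSSETXATTR, &fa))
		perror("FS_IOC_FSSETXATTR");

	close(fd);
	return 0;
}
```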
+
+static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_inode_info *dir = p;
+
+ return !bch2_reinherit_attrs(bi, &dir->ei_inode);
+}
+
+static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *src,
+ const char __user *name)
+{
+ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
+ struct bch_inode_info *dst;
+ struct inode *vinode = NULL;
+ char *kname = NULL;
+ struct qstr qstr;
+ int ret = 0;
+ subvol_inum inum;
+
+ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+ if (!kname)
+ return -ENOMEM;
+
+ ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
+ if (unlikely(ret < 0))
+ goto err1;
+
+ qstr.len = ret;
+ qstr.name = kname;
+
+ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+ if (ret)
+ goto err1;
+
+ vinode = bch2_vfs_inode_get(c, inum);
+ ret = PTR_ERR_OR_ZERO(vinode);
+ if (ret)
+ goto err1;
+
+ dst = to_bch_ei(vinode);
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto err2;
+
+ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+ if (inode_attr_changing(src, dst, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, dst,
+ src->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err3;
+ }
+
+ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
+err3:
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+ /* return true if we did work */
+ if (ret >= 0)
+ ret = !ret;
+
+ mnt_drop_write_file(file);
+err2:
+ iput(vinode);
+err1:
+ kfree(kname);
+
+ return ret;
+}
+
+static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
+{
+ u32 flags;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, arg))
+ return -EFAULT;
+
+ bch_notice(c, "shutdown by ioctl type %u", flags);
+
+ down_write(&c->vfs_sb->s_umount);
+
+ switch (flags) {
+ case FSOP_GOING_FLAGS_DEFAULT:
+ ret = freeze_bdev(c->vfs_sb->s_bdev);
+ if (ret)
+ goto err;
+
+ bch2_journal_flush(&c->journal);
+ c->vfs_sb->s_flags |= SB_RDONLY;
+ bch2_fs_emergency_read_only(c);
+ thaw_bdev(c->vfs_sb->s_bdev);
+ break;
+
+ case FSOP_GOING_FLAGS_LOGFLUSH:
+ bch2_journal_flush(&c->journal);
+ fallthrough;
+
+ case FSOP_GOING_FLAGS_NOLOGFLUSH:
+ c->vfs_sb->s_flags |= SB_RDONLY;
+ bch2_fs_emergency_read_only(c);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+err:
+ up_write(&c->vfs_sb->s_umount);
+ return ret;
+}
+
+static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct inode *dir;
+ struct bch_inode_info *inode;
+ struct user_namespace *s_user_ns;
+ struct dentry *dst_dentry;
+ struct path src_path, dst_path;
+ int how = LOOKUP_FOLLOW;
+ int error;
+ subvol_inum snapshot_src = { 0 };
+ unsigned lookup_flags = 0;
+ unsigned create_flags = BCH_CREATE_SUBVOL;
+
+ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+ BCH_SUBVOL_SNAPSHOT_RO))
+ return -EINVAL;
+
+ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ (arg.src_ptr ||
+ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+ return -EINVAL;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ create_flags |= BCH_CREATE_SNAPSHOT;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+ create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+ /* why do we need this lock? */
+ down_read(&c->vfs_sb->s_umount);
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ sync_inodes_sb(c->vfs_sb);
+retry:
+ if (arg.src_ptr) {
+ error = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.src_ptr,
+ how, &src_path);
+ if (error)
+ goto err1;
+
+ if (src_path.dentry->d_sb->s_fs_info != c) {
+ path_put(&src_path);
+ error = -EXDEV;
+ goto err1;
+ }
+
+ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+ }
+
+ dst_dentry = user_path_create(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ &dst_path, lookup_flags);
+ error = PTR_ERR_OR_ZERO(dst_dentry);
+ if (error)
+ goto err2;
+
+ if (dst_dentry->d_sb->s_fs_info != c) {
+ error = -EXDEV;
+ goto err3;
+ }
+
+ if (dst_dentry->d_inode) {
+ error = -EEXIST;
+ goto err3;
+ }
+
+ dir = dst_path.dentry->d_inode;
+ if (IS_DEADDIR(dir)) {
+ error = -BCH_ERR_ENOENT_directory_dead;
+ goto err3;
+ }
+
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid())) {
+ error = -EOVERFLOW;
+ goto err3;
+ }
+
+ error = inode_permission(file_mnt_idmap(filp),
+ dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ goto err3;
+
+ if (!IS_POSIXACL(dir))
+ arg.mode &= ~current_umask();
+
+ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+ if (error)
+ goto err3;
+
+ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ !arg.src_ptr)
+ snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
+
+ inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
+ dst_dentry, arg.mode|S_IFDIR,
+ 0, snapshot_src, create_flags);
+ error = PTR_ERR_OR_ZERO(inode);
+ if (error)
+ goto err3;
+
+ d_instantiate(dst_dentry, &inode->v);
+ fsnotify_mkdir(dir, dst_dentry);
+err3:
+ done_path_create(&dst_path, dst_dentry);
+err2:
+ if (arg.src_ptr)
+ path_put(&src_path);
+
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+err1:
+ up_read(&c->vfs_sb->s_umount);
+
+ return error;
+}
+
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ down_write(&c->snapshot_create_lock);
+ long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
+ up_write(&c->snapshot_create_lock);
+
+ return ret;
+}
+
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct path path;
+ struct inode *dir;
+ int ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ ret = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ret;
+
+ if (path.dentry->d_sb->s_fs_info != c) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ dir = path.dentry->d_parent->d_inode;
+
+ ret = __bch2_unlink(dir, path.dentry, true);
+ if (ret)
+ goto err;
+
+ fsnotify_rmdir(dir, path.dentry);
+ d_delete(path.dentry);
+err:
+ path_put(&path);
+ return ret;
+}
+
+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ ret = bch2_ioc_getflags(inode, (int __user *) arg);
+ break;
+
+ case FS_IOC_SETFLAGS:
+ ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+ break;
+
+ case FS_IOC_FSGETXATTR:
+ ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+ break;
+
+ case FS_IOC_FSSETXATTR:
+ ret = bch2_ioc_fssetxattr(c, file, inode,
+ (void __user *) arg);
+ break;
+
+ case BCHFS_IOC_REINHERIT_ATTRS:
+ ret = bch2_ioc_reinherit_attrs(c, file, inode,
+ (void __user *) arg);
+ break;
+
+ case FS_IOC_GETVERSION:
+ ret = -ENOTTY;
+ break;
+
+ case FS_IOC_SETVERSION:
+ ret = -ENOTTY;
+ break;
+
+ case FS_IOC_GOINGDOWN:
+ ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
+ break;
+
+ case BCH_IOCTL_SUBVOLUME_CREATE: {
+ struct bch_ioctl_subvolume i;
+
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_create(c, file, i);
+ break;
+ }
+
+ case BCH_IOCTL_SUBVOLUME_DESTROY: {
+ struct bch_ioctl_subvolume i;
+
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_destroy(c, file, i);
+ break;
+ }
+
+ default:
+ ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
+ break;
+ }
+
+ return bch2_err_class(ret);
+}
+
+#ifdef CONFIG_COMPAT
+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+ /* These are just misnamed, they actually get/put from/to user an int */
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
new file mode 100644
index 000000000000..d30f9bb056fd
--- /dev/null
+++ b/fs/bcachefs/fs-ioctl.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IOCTL_H
+#define _BCACHEFS_FS_IOCTL_H
+
+/* Inode flags: */
+
+/* bcachefs inode flags -> vfs inode flags: */
+static const __maybe_unused unsigned bch_flags_to_vfs[] = {
+ [__BCH_INODE_sync] = S_SYNC,
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
+ [__BCH_INODE_append] = S_APPEND,
+ [__BCH_INODE_noatime] = S_NOATIME,
+};
+
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
+ [__BCH_INODE_sync] = FS_SYNC_FL,
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_append] = FS_APPEND_FL,
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
+};
+
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
+ [__BCH_INODE_sync] = FS_XFLAG_SYNC,
+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
+ [__BCH_INODE_append] = FS_XFLAG_APPEND,
+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
+ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
+};
+
+#define set_flags(_map, _in, _out) \
+do { \
+ unsigned _i; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & (1 << _i)) \
+ (_out) |= _map[_i]; \
+ else \
+ (_out) &= ~_map[_i]; \
+} while (0)
+
+#define map_flags(_map, _in) \
+({ \
+ unsigned _out = 0; \
+ \
+ set_flags(_map, _in, _out); \
+ _out; \
+})
+
+#define map_flags_rev(_map, _in) \
+({ \
+ unsigned _i, _out = 0; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & _map[_i]) { \
+ (_out) |= 1 << _i; \
+ (_in) &= ~_map[_i]; \
+ } \
+ (_out); \
+})
+
+#define map_defined(_map) \
+({ \
+ unsigned _in = ~0; \
+ \
+ map_flags_rev(_map, _in); \
+})
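These macros just translate between bit positions in one flag word and arbitrary bit values in another, using the arrays above as lookup tables; bits left over in map_flags_rev()'s input are how the ioctl handlers detect unsupported flags. A standalone miniature of the same idea (toy flag values, not the real tables):

```c
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* toy mapping: internal flag bit i -> external flag value map[i] */
static const unsigned map[] = { 0x10, 0x200, 0x4000 };

static unsigned map_fwd(unsigned in)
{
	unsigned i, out = 0;

	for (i = 0; i < ARRAY_SIZE(map); i++)
		if (in & (1U << i))
			out |= map[i];
	return out;
}

static unsigned map_rev(unsigned *in)
{
	unsigned i, out = 0;

	for (i = 0; i < ARRAY_SIZE(map); i++)
		if (*in & map[i]) {
			out |= 1U << i;
			*in &= ~map[i];	/* leftover bits = unsupported flags */
		}
	return out;
}

int main(void)
{
	unsigned external = 0x10 | 0x4000;
	unsigned internal = map_rev(&external);

	printf("internal %#x, unsupported leftover %#x\n", internal, external);
	printf("back to external %#x\n", map_fwd(internal));
	return 0;
}
```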
+
+/* Set VFS inode flags from bcachefs inode: */
+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
+{
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+}
+
+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
+
+#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
new file mode 100644
index 000000000000..49da8db1d9e9
--- /dev/null
+++ b/fs/bcachefs/fs.c
@@ -0,0 +1,2010 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "errcode.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-common.h"
+#include "fs-io.h"
+#include "fs-ioctl.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io_read.h"
+#include "journal.h"
+#include "keylist.h"
+#include "quota.h"
+#include "snapshot.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/exportfs.h>
+#include <linux/fiemap.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+
+static struct kmem_cache *bch2_inode_cache;
+
+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *,
+ struct bch_subvolume *);
+
+void bch2_inode_update_after_write(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ unsigned fields)
+{
+ struct bch_fs *c = trans->c;
+
+ BUG_ON(bi->bi_inum != inode->v.i_ino);
+
+ bch2_assert_pos_locked(trans, BTREE_ID_inodes,
+ POS(0, bi->bi_inum),
+ c->opts.inodes_use_key_cache);
+
+ set_nlink(&inode->v, bch2_inode_nlink_get(bi));
+ i_uid_write(&inode->v, bi->bi_uid);
+ i_gid_write(&inode->v, bi->bi_gid);
+ inode->v.i_mode = bi->bi_mode;
+
+ if (fields & ATTR_ATIME)
+ inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
+ if (fields & ATTR_MTIME)
+ inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
+ if (fields & ATTR_CTIME)
+ inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
+
+ inode->ei_inode = *bi;
+
+ bch2_inode_flags_to_vfs(inode);
+}
+
+int __must_check bch2_write_inode(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ inode_set_fn set,
+ void *p, unsigned fields)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT) ?:
+ (set ? set(trans, inode, &inode_u, p) : 0) ?:
+ bch2_inode_write(trans, &iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+
+ /*
+ * the btree node lock protects inode->ei_inode, not ei_update_lock;
+ * this is important for inode updates via bchfs_write_index_update
+ */
+ if (!ret)
+ bch2_inode_update_after_write(trans, inode, &inode_u, fields);
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
+ "inode %u:%llu not found when updating",
+ inode_inum(inode).subvol,
+ inode_inum(inode).inum);
+
+ bch2_trans_put(trans);
+ return ret < 0 ? ret : 0;
+}
+
+int bch2_fs_quota_transfer(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_qid new_qid,
+ unsigned qtypes,
+ enum quota_acct_mode mode)
+{
+ unsigned i;
+ int ret;
+
+ qtypes &= enabled_qtypes(c);
+
+ for (i = 0; i < QTYP_NR; i++)
+ if (new_qid.q[i] == inode->ei_qid.q[i])
+ qtypes &= ~(1U << i);
+
+ if (!qtypes)
+ return 0;
+
+ mutex_lock(&inode->ei_quota_lock);
+
+ ret = bch2_quota_transfer(c, qtypes, new_qid,
+ inode->ei_qid,
+ inode->v.i_blocks +
+ inode->ei_quota_reserved,
+ mode);
+ if (!ret)
+ for (i = 0; i < QTYP_NR; i++)
+ if (qtypes & (1 << i))
+ inode->ei_qid.q[i] = new_qid.q[i];
+
+ mutex_unlock(&inode->ei_quota_lock);
+
+ return ret;
+}
+
+static int bch2_iget5_test(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ return inode->ei_subvol == inum->subvol &&
+ inode->ei_inode.bi_inum == inum->inum;
+}
+
+static int bch2_iget5_set(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ inode->v.i_ino = inum->inum;
+ inode->ei_subvol = inum->subvol;
+ inode->ei_inode.bi_inum = inum->inum;
+ return 0;
+}
+
+static unsigned bch2_inode_hash(subvol_inum inum)
+{
+ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+ struct bch_inode_unpacked inode_u;
+ struct bch_inode_info *inode;
+ struct btree_trans *trans;
+ struct bch_subvolume subvol;
+ int ret;
+
+ inode = to_bch_ei(iget5_locked(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->v.i_state & I_NEW))
+ return &inode->v;
+
+ trans = bch2_trans_get(c);
+ ret = lockrestart_do(trans,
+ bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+
+ if (!ret)
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ bch2_trans_put(trans);
+
+ if (ret) {
+ iget_failed(&inode->v);
+ return ERR_PTR(bch2_err_class(ret));
+ }
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ unlock_new_inode(&inode->v);
+
+ return &inode->v;
+}
+
+struct bch_inode_info *
+__bch2_create(struct mnt_idmap *idmap,
+ struct bch_inode_info *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+ unsigned flags)
+{
+ struct bch_fs *c = dir->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct bch_inode_unpacked dir_u;
+ struct bch_inode_info *inode, *old;
+ struct bch_inode_unpacked inode_u;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ subvol_inum inum;
+ struct bch_subvolume subvol;
+ u64 journal_seq = 0;
+ int ret;
+
+ /*
+ * preallocate acls + vfs inode before btree transaction, so that
+ * nothing can fail after the transaction succeeds:
+ */
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
+ if (ret)
+ return ERR_PTR(ret);
+#endif
+ inode = to_bch_ei(new_inode(c->vfs_sb));
+ if (unlikely(!inode)) {
+ inode = ERR_PTR(-ENOMEM);
+ goto err;
+ }
+
+ bch2_inode_init_early(c, &inode_u);
+
+ if (!(flags & BCH_CREATE_TMPFILE))
+ mutex_lock(&dir->ei_update_lock);
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
+ bch2_create_trans(trans,
+ inode_inum(dir), &dir_u, &inode_u,
+ !(flags & BCH_CREATE_TMPFILE)
+ ? &dentry->d_name : NULL,
+ from_kuid(i_user_ns(&dir->v), current_fsuid()),
+ from_kgid(i_user_ns(&dir->v), current_fsgid()),
+ mode, rdev,
+ default_acl, acl, snapshot_src, flags) ?:
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (unlikely(ret))
+ goto err_before_quota;
+
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.inum = inode_u.bi_inum;
+
+ ret = bch2_subvolume_get(trans, inum.subvol, true,
+ BTREE_ITER_WITH_UPDATES, &subvol) ?:
+ bch2_trans_commit(trans, NULL, &journal_seq, 0);
+ if (unlikely(ret)) {
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
+ KEY_TYPE_QUOTA_WARN);
+err_before_quota:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ goto err_trans;
+ }
+
+ if (!(flags & BCH_CREATE_TMPFILE)) {
+ bch2_inode_update_after_write(trans, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ mutex_unlock(&dir->ei_update_lock);
+ }
+
+ bch2_iget5_set(&inode->v, &inum);
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+
+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
+
+ /*
+ * we must insert the new inode into the inode cache before calling
+ * bch2_trans_exit() and dropping locks, else we could race with another
+ * thread pulling the inode in and modifying it:
+ */
+
+ inode->v.i_state |= I_CREATING;
+
+ old = to_bch_ei(inode_insert5(&inode->v,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
+ BUG_ON(!old);
+
+ if (unlikely(old != inode)) {
+ /*
+ * We raced, another process pulled the new inode into cache
+ * before us:
+ */
+ make_bad_inode(&inode->v);
+ iput(&inode->v);
+
+ inode = old;
+ } else {
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+ /*
+ * we really don't want insert_inode_locked2() to be setting
+ * I_NEW...
+ */
+ unlock_new_inode(&inode->v);
+ }
+
+ bch2_trans_put(trans);
+err:
+ posix_acl_release(default_acl);
+ posix_acl_release(acl);
+ return inode;
+err_trans:
+ if (!(flags & BCH_CREATE_TMPFILE))
+ mutex_unlock(&dir->ei_update_lock);
+
+ bch2_trans_put(trans);
+ make_bad_inode(&inode->v);
+ iput(&inode->v);
+ inode = ERR_PTR(ret);
+ goto err;
+}
+
+/* methods */
+
+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
+ struct inode *vinode = NULL;
+ subvol_inum inum = { .subvol = 1 };
+ int ret;
+
+ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
+ &dentry->d_name, &inum);
+
+ if (!ret)
+ vinode = bch2_vfs_inode_get(c, inum);
+
+ return d_splice_alias(vinode, dentry);
+}
+
+static int bch2_mknod(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct bch_inode_info *inode =
+ __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
+ (subvol_inum) { 0 }, 0);
+
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
+
+ d_instantiate(dentry, &inode->v);
+ return 0;
+}
+
+static int bch2_create(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
+}
+
+static int __bch2_link(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_inode_info *dir,
+ struct dentry *dentry)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_inode_unpacked dir_u, inode_u;
+ int ret;
+
+ mutex_lock(&inode->ei_update_lock);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_link_trans(trans,
+ inode_inum(dir), &dir_u,
+ inode_inum(inode), &inode_u,
+ &dentry->d_name));
+
+ if (likely(!ret)) {
+ bch2_inode_update_after_write(trans, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
+ }
+
+ bch2_trans_put(trans);
+ mutex_unlock(&inode->ei_update_lock);
+ return ret;
+}
+
+static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
+ struct dentry *dentry)
+{
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
+ int ret;
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+
+ ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ __bch2_link(c, inode, dir, dentry);
+ if (unlikely(ret))
+ return ret;
+
+ ihold(&inode->v);
+ d_instantiate(dentry, &inode->v);
+ return 0;
+}
+
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+ bool deleting_snapshot)
+{
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_inode_unpacked dir_u, inode_u;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret;
+
+ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
+
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_unlink_trans(trans,
+ inode_inum(dir), &dir_u,
+ &inode_u, &dentry->d_name,
+ deleting_snapshot));
+ if (unlikely(ret))
+ goto err;
+
+ bch2_inode_update_after_write(trans, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ bch2_inode_update_after_write(trans, inode, &inode_u,
+ ATTR_MTIME);
+
+ if (inode_u.bi_subvol) {
+ /*
+ * Subvolume deletion is asynchronous, but we still want to tell
+ * the VFS that it's been deleted here:
+ */
+ set_nlink(&inode->v, 0);
+ }
+err:
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_fs *c = dir->v.i_sb->s_fs_info;
+
+ return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ __bch2_unlink(vdir, dentry, false);
+}
+
+static int bch2_symlink(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
+ const char *symname)
+{
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
+ struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
+ int ret;
+
+ inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
+
+ inode_lock(&inode->v);
+ ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
+ inode_unlock(&inode->v);
+
+ if (unlikely(ret))
+ goto err;
+
+ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
+ if (unlikely(ret))
+ goto err;
+
+ ret = __bch2_link(c, inode, dir, dentry);
+ if (unlikely(ret))
+ goto err;
+
+ d_instantiate(dentry, &inode->v);
+ return 0;
+err:
+ iput(&inode->v);
+ return ret;
+}
+
+static int bch2_mkdir(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
+{
+ return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
+}
+
+static int bch2_rename2(struct mnt_idmap *idmap,
+ struct inode *src_vdir, struct dentry *src_dentry,
+ struct inode *dst_vdir, struct dentry *dst_dentry,
+ unsigned flags)
+{
+ struct bch_fs *c = src_vdir->i_sb->s_fs_info;
+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
+ struct bch_inode_unpacked dst_dir_u, src_dir_u;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ struct btree_trans *trans;
+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE
+ ? BCH_RENAME_EXCHANGE
+ : dst_dentry->d_inode
+ ? BCH_RENAME_OVERWRITE : BCH_RENAME;
+ int ret;
+
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (mode == BCH_RENAME_OVERWRITE) {
+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
+ 0, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
+ trans = bch2_trans_get(c);
+
+ bch2_lock_inodes(INODE_UPDATE_LOCK,
+ src_dir,
+ dst_dir,
+ src_inode,
+ dst_inode);
+
+ ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
+ bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
+ if (ret)
+ goto err;
+
+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, src_inode,
+ dst_dir->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, dst_inode,
+ src_dir->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+ }
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_rename_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode));
+ if (unlikely(ret))
+ goto err;
+
+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
+ BUG_ON(dst_inode &&
+ dst_inode->v.i_ino != dst_inode_u.bi_inum);
+
+ bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+
+ if (src_dir != dst_dir)
+ bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+
+ bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
+ ATTR_CTIME);
+
+ if (dst_inode)
+ bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
+ ATTR_CTIME);
+err:
+ bch2_trans_put(trans);
+
+ bch2_fs_quota_transfer(c, src_inode,
+ bch_qid(&src_inode->ei_inode),
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_NOCHECK);
+ if (dst_inode)
+ bch2_fs_quota_transfer(c, dst_inode,
+ bch_qid(&dst_inode->ei_inode),
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_NOCHECK);
+
+ bch2_unlock_inodes(INODE_UPDATE_LOCK,
+ src_dir,
+ dst_dir,
+ src_inode,
+ dst_inode);
+
+ return ret;
+}
+
+static void bch2_setattr_copy(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ struct iattr *attr)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ unsigned int ia_valid = attr->ia_valid;
+
+ if (ia_valid & ATTR_UID)
+ bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+ if (ia_valid & ATTR_GID)
+ bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+
+ if (ia_valid & ATTR_SIZE)
+ bi->bi_size = attr->ia_size;
+
+ if (ia_valid & ATTR_ATIME)
+ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
+ if (ia_valid & ATTR_MTIME)
+ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
+ if (ia_valid & ATTR_CTIME)
+ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
+
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = attr->ia_mode;
+ kgid_t gid = ia_valid & ATTR_GID
+ ? attr->ia_gid
+ : inode->v.i_gid;
+
+ if (!in_group_p(gid) &&
+ !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
+ mode &= ~S_ISGID;
+ bi->bi_mode = mode;
+ }
+}
+
+int bch2_setattr_nonsize(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct iattr *attr)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_qid qid;
+ struct btree_trans *trans;
+ struct btree_iter inode_iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ struct posix_acl *acl = NULL;
+ int ret;
+
+ mutex_lock(&inode->ei_update_lock);
+
+ qid = inode->ei_qid;
+
+ if (attr->ia_valid & ATTR_UID)
+ qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+
+ if (attr->ia_valid & ATTR_GID)
+ qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+
+ ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+ kfree(acl);
+ acl = NULL;
+
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto btree_err;
+
+ bch2_setattr_copy(idmap, inode, &inode_u, attr);
+
+ if (attr->ia_valid & ATTR_MODE) {
+ ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
+ inode_u.bi_mode, &acl);
+ if (ret)
+ goto btree_err;
+ }
+
+ ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+btree_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ if (unlikely(ret))
+ goto err_trans;
+
+ bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
+
+ if (acl)
+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+err_trans:
+ bch2_trans_put(trans);
+err:
+ mutex_unlock(&inode->ei_update_lock);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned query_flags)
+{
+ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ stat->dev = inode->v.i_sb->s_dev;
+ stat->ino = inode->v.i_ino;
+ stat->mode = inode->v.i_mode;
+ stat->nlink = inode->v.i_nlink;
+ stat->uid = inode->v.i_uid;
+ stat->gid = inode->v.i_gid;
+ stat->rdev = inode->v.i_rdev;
+ stat->size = i_size_read(&inode->v);
+ stat->atime = inode_get_atime(&inode->v);
+ stat->mtime = inode_get_mtime(&inode->v);
+ stat->ctime = inode_get_ctime(&inode->v);
+ stat->blksize = block_bytes(c);
+ stat->blocks = inode->v.i_blocks;
+
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
+ }
+
+ if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
+
+ if (inode->ei_inode.bi_flags & BCH_INODE_append)
+ stat->attributes |= STATX_ATTR_APPEND;
+ stat->attributes_mask |= STATX_ATTR_APPEND;
+
+ if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= STATX_ATTR_NODUMP;
+
+ return 0;
+}
+
+static int bch2_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *iattr)
+{
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret;
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+
+ ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ setattr_prepare(idmap, dentry, iattr);
+ if (ret)
+ return ret;
+
+ return iattr->ia_valid & ATTR_SIZE
+ ? bchfs_truncate(idmap, inode, iattr)
+ : bch2_setattr_nonsize(idmap, inode, iattr);
+}
+
+static int bch2_tmpfile(struct mnt_idmap *idmap,
+ struct inode *vdir, struct file *file, umode_t mode)
+{
+ struct bch_inode_info *inode =
+ __bch2_create(idmap, to_bch_ei(vdir),
+ file->f_path.dentry, mode, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
+
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
+
+ d_mark_tmpfile(file, &inode->v);
+ d_instantiate(file->f_path.dentry, &inode->v);
+ return finish_open_simple(file, 0);
+}
+
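+/*
+ * Report a single bkey to the fiemap interface: extents with direct data emit
+ * one fiemap extent per pointer, while inline data and reservations are
+ * reported as a single extent with the appropriate flags.
+ */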
+static int bch2_fill_extent(struct bch_fs *c,
+ struct fiemap_extent_info *info,
+ struct bkey_s_c k, unsigned flags)
+{
+ if (bkey_extent_is_direct_data(k.k)) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ int ret;
+
+ if (k.k->type == KEY_TYPE_reflink_v)
+ flags |= FIEMAP_EXTENT_SHARED;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ int flags2 = 0;
+ u64 offset = p.ptr.offset;
+
+ if (p.ptr.unwritten)
+ flags2 |= FIEMAP_EXTENT_UNWRITTEN;
+
+ if (p.crc.compression_type)
+ flags2 |= FIEMAP_EXTENT_ENCODED;
+ else
+ offset += p.crc.offset;
+
+ if ((offset & (block_sectors(c) - 1)) ||
+ (k.k->size & (block_sectors(c) - 1)))
+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+ ret = fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ offset << 9,
+ k.k->size << 9, flags|flags2);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+ } else if (bkey_extent_is_inline_data(k.k)) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
+ flags|
+ FIEMAP_EXTENT_DATA_INLINE);
+ } else if (k.k->type == KEY_TYPE_reservation) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
+ flags|
+ FIEMAP_EXTENT_DELALLOC|
+ FIEMAP_EXTENT_UNWRITTEN);
+ } else {
+ BUG();
+ }
+}
+
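+/*
+ * Walk the extents btree for the fiemap ioctl, resolving indirect (reflink)
+ * extents as we go. Extents are reported one behind the iterator so that the
+ * final one can be flagged FIEMAP_EXTENT_LAST.
+ */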
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
+ u64 start, u64 len)
+{
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(vinode);
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_buf cur, prev;
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
+ unsigned offset_into_extent, sectors;
+ bool have_extent = false;
+ u32 snapshot;
+ int ret = 0;
+
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ if (start + len < start)
+ return -EINVAL;
+
+ start >>= 9;
+
+ bch2_bkey_buf_init(&cur);
+ bch2_bkey_buf_init(&prev);
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(ei->v.i_ino, start, snapshot), 0);
+
+ while (!(ret = btree_trans_too_many_iters(trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
+ !(ret = bkey_err(k))) {
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ if (!bkey_extent_is_data(k.k) &&
+ k.k->type != KEY_TYPE_reservation) {
+ bch2_btree_iter_advance(&iter);
+ continue;
+ }
+
+ offset_into_extent = iter.pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&cur, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &cur);
+ if (ret)
+ break;
+
+ k = bkey_i_to_s_c(cur.k);
+ bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ bch2_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) +
+ offset_into_extent),
+ cur.k);
+ bch2_key_resize(&cur.k->k, sectors);
+ cur.k->k.p = iter.pos;
+ cur.k->k.p.offset += cur.k->k.size;
+
+ if (have_extent) {
+ bch2_trans_unlock(trans);
+ ret = bch2_fill_extent(c, info,
+ bkey_i_to_s_c(prev.k), 0);
+ if (ret)
+ break;
+ }
+
+ bkey_copy(prev.k, cur.k);
+ have_extent = true;
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(iter.pos.inode, iter.pos.offset + sectors));
+ }
+ start = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (!ret && have_extent) {
+ bch2_trans_unlock(trans);
+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
+ FIEMAP_EXTENT_LAST);
+ }
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
+ return ret < 0 ? ret : 0;
+}
+
+static const struct vm_operations_struct bch_vm_ops = {
+ .fault = bch2_page_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = bch2_page_mkwrite,
+};
+
+static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+
+ vma->vm_ops = &bch_vm_ops;
+ return 0;
+}
+
+/* Directories: */
+
+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_file_llseek_size(file, offset, whence,
+ S64_MAX, S64_MAX);
+}
+
+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ ret = bch2_readdir(c, inode_inum(inode), ctx);
+ if (ret)
+ bch_err_fn(c, ret);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_open(struct inode *vinode, struct file *file)
+{
+ if (file->f_flags & (O_WRONLY|O_RDWR)) {
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
+ if (ret)
+ return ret;
+ }
+
+ return generic_file_open(vinode, file);
+}
+
+static const struct file_operations bch_file_operations = {
+ .open = bch2_open,
+ .llseek = bch2_llseek,
+ .read_iter = bch2_read_iter,
+ .write_iter = bch2_write_iter,
+ .mmap = bch2_mmap,
+ .fsync = bch2_fsync,
+ .splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = bch2_fallocate_dispatch,
+ .unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch2_compat_fs_ioctl,
+#endif
+ .remap_file_range = bch2_remap_file_range,
+};
+
+static const struct inode_operations bch_file_inode_operations = {
+ .getattr = bch2_getattr,
+ .setattr = bch2_setattr,
+ .fiemap = bch2_fiemap,
+ .listxattr = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ .get_acl = bch2_get_acl,
+ .set_acl = bch2_set_acl,
+#endif
+};
+
+static const struct inode_operations bch_dir_inode_operations = {
+ .lookup = bch2_lookup,
+ .create = bch2_create,
+ .link = bch2_link,
+ .unlink = bch2_unlink,
+ .symlink = bch2_symlink,
+ .mkdir = bch2_mkdir,
+ .rmdir = bch2_unlink,
+ .mknod = bch2_mknod,
+ .rename = bch2_rename2,
+ .getattr = bch2_getattr,
+ .setattr = bch2_setattr,
+ .tmpfile = bch2_tmpfile,
+ .listxattr = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ .get_acl = bch2_get_acl,
+ .set_acl = bch2_set_acl,
+#endif
+};
+
+static const struct file_operations bch_dir_file_operations = {
+ .llseek = bch2_dir_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = bch2_vfs_readdir,
+ .fsync = bch2_fsync,
+ .unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch2_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_symlink_inode_operations = {
+ .get_link = page_get_link,
+ .getattr = bch2_getattr,
+ .setattr = bch2_setattr,
+ .listxattr = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ .get_acl = bch2_get_acl,
+ .set_acl = bch2_set_acl,
+#endif
+};
+
+static const struct inode_operations bch_special_inode_operations = {
+ .getattr = bch2_getattr,
+ .setattr = bch2_setattr,
+ .listxattr = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ .get_acl = bch2_get_acl,
+ .set_acl = bch2_set_acl,
+#endif
+};
+
+static const struct address_space_operations bch_address_space_operations = {
+ .read_folio = bch2_read_folio,
+ .writepages = bch2_writepages,
+ .readahead = bch2_readahead,
+ .dirty_folio = filemap_dirty_folio,
+ .write_begin = bch2_write_begin,
+ .write_end = bch2_write_end,
+ .invalidate_folio = bch2_invalidate_folio,
+ .release_folio = bch2_release_folio,
+ .direct_IO = noop_direct_IO,
+#ifdef CONFIG_MIGRATION
+ .migrate_folio = filemap_migrate_folio,
+#endif
+ .error_remove_page = generic_error_remove_page,
+};
+
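+/*
+ * NFS export support: a bcachefs file handle encodes the inode number, the
+ * subvolume it lives in and the inode generation, optionally followed by the
+ * same triple for the parent directory.
+ */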
+struct bcachefs_fid {
+ u64 inum;
+ u32 subvol;
+ u32 gen;
+} __packed;
+
+struct bcachefs_fid_with_parent {
+ struct bcachefs_fid fid;
+ struct bcachefs_fid dir;
+} __packed;
+
+static int bcachefs_fid_valid(int fh_len, int fh_type)
+{
+ switch (fh_type) {
+ case FILEID_BCACHEFS_WITHOUT_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+ case FILEID_BCACHEFS_WITH_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+ default:
+ return false;
+ }
+}
+
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+ return (struct bcachefs_fid) {
+ .inum = inode->ei_inode.bi_inum,
+ .subvol = inode->ei_subvol,
+ .gen = inode->ei_inode.bi_generation,
+ };
+}
+
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+ struct inode *vdir)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ int min_len;
+
+ if (!S_ISDIR(inode->v.i_mode) && dir) {
+ struct bcachefs_fid_with_parent *fid = (void *) fh;
+
+ min_len = sizeof(*fid) / sizeof(u32);
+ if (*len < min_len) {
+ *len = min_len;
+ return FILEID_INVALID;
+ }
+
+ fid->fid = bch2_inode_to_fid(inode);
+ fid->dir = bch2_inode_to_fid(dir);
+
+ *len = min_len;
+ return FILEID_BCACHEFS_WITH_PARENT;
+ } else {
+ struct bcachefs_fid *fid = (void *) fh;
+
+ min_len = sizeof(*fid) / sizeof(u32);
+ if (*len < min_len) {
+ *len = min_len;
+ return FILEID_INVALID;
+ }
+ *fid = bch2_inode_to_fid(inode);
+
+ *len = min_len;
+ return FILEID_BCACHEFS_WITHOUT_PARENT;
+ }
+}
+
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+ struct bcachefs_fid fid)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+ .subvol = fid.subvol,
+ .inum = fid.inum,
+ });
+ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
+ iput(vinode);
+ vinode = ERR_PTR(-ESTALE);
+ }
+ return vinode;
+}
+
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
+ int fh_len, int fh_type)
+{
+ struct bcachefs_fid *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type))
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
+}
+
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
+ int fh_len, int fh_type)
+{
+ struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type) ||
+ fh_type != FILEID_BCACHEFS_WITH_PARENT)
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ subvol_inum parent_inum = {
+ .subvol = inode->ei_inode.bi_parent_subvol ?:
+ inode->ei_subvol,
+ .inum = inode->ei_inode.bi_dir,
+ };
+
+ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
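+/*
+ * export_operations->get_name: find @child's name in @parent. The common case
+ * uses the dirent backpointer stored in the inode (bi_dir/bi_dir_offset); for
+ * hardlinked files whose backpointer refers to a different directory we fall
+ * back to scanning @parent's dirents.
+ */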
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct btree_iter iter1;
+ struct btree_iter iter2;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ struct bch_inode_unpacked inode_u;
+ subvol_inum target;
+ u32 snapshot;
+ struct qstr dirent_name;
+ unsigned name_len = 0;
+ int ret;
+
+ if (!S_ISDIR(dir->v.i_mode))
+ return -EINVAL;
+
+ trans = bch2_trans_get(c);
+
+ bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+ bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter1, snapshot);
+ bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+ ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
+ if (ret)
+ goto err;
+
+ if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+ k = bch2_btree_iter_peek_slot(&iter1);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_dirent) {
+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ goto err;
+ }
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
+ if (ret > 0)
+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ if (ret)
+ goto err;
+
+ if (target.subvol == inode->ei_subvol &&
+ target.inum == inode->ei_inode.bi_inum)
+ goto found;
+ } else {
+ /*
+ * The file has multiple hardlinks and our backref points to the wrong
+ * directory - fall back to a linear search:
+ */
+ for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+ if (k.k->p.inode > dir->ei_inode.bi_inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ if (target.subvol == inode->ei_subvol &&
+ target.inum == inode->ei_inode.bi_inum)
+ goto found;
+ }
+ }
+
+ ret = -ENOENT;
+ goto err;
+found:
+ dirent_name = bch2_dirent_get_name(d);
+
+ name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
+ memcpy(name, dirent_name.name, name_len);
+ name[name_len] = '\0';
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter1);
+ bch2_trans_iter_exit(trans, &iter2);
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static const struct export_operations bch_export_ops = {
+ .encode_fh = bch2_encode_fh,
+ .fh_to_dentry = bch2_fh_to_dentry,
+ .fh_to_parent = bch2_fh_to_parent,
+ .get_parent = bch2_get_parent,
+ .get_name = bch2_get_name,
+};
+
+static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
+{
+ bch2_inode_update_after_write(trans, inode, bi, ~0);
+
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+ else
+ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+
+ inode->v.i_blocks = bi->bi_sectors;
+ inode->v.i_ino = bi->bi_inum;
+ inode->v.i_rdev = bi->bi_dev;
+ inode->v.i_generation = bi->bi_generation;
+ inode->v.i_size = bi->bi_size;
+
+ inode->ei_flags = 0;
+ inode->ei_quota_reserved = 0;
+ inode->ei_qid = bch_qid(bi);
+ inode->ei_subvol = inum.subvol;
+
+ inode->v.i_mapping->a_ops = &bch_address_space_operations;
+
+ switch (inode->v.i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->v.i_op = &bch_file_inode_operations;
+ inode->v.i_fop = &bch_file_operations;
+ break;
+ case S_IFDIR:
+ inode->v.i_op = &bch_dir_inode_operations;
+ inode->v.i_fop = &bch_dir_file_operations;
+ break;
+ case S_IFLNK:
+ inode_nohighmem(&inode->v);
+ inode->v.i_op = &bch_symlink_inode_operations;
+ break;
+ default:
+ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
+ inode->v.i_op = &bch_special_inode_operations;
+ break;
+ }
+
+ mapping_set_large_folios(inode->v.i_mapping);
+}
+
+static struct inode *bch2_alloc_inode(struct super_block *sb)
+{
+ struct bch_inode_info *inode;
+
+ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+ if (!inode)
+ return NULL;
+
+ inode_init_once(&inode->v);
+ mutex_init(&inode->ei_update_lock);
+ two_state_lock_init(&inode->ei_pagecache_lock);
+ INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ mutex_init(&inode->ei_quota_lock);
+
+ return &inode->v;
+}
+
+static void bch2_i_callback(struct rcu_head *head)
+{
+ struct inode *vinode = container_of(head, struct inode, i_rcu);
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+
+ kmem_cache_free(bch2_inode_cache, inode);
+}
+
+static void bch2_destroy_inode(struct inode *vinode)
+{
+ call_rcu(&vinode->i_rcu, bch2_i_callback);
+}
+
+static int inode_update_times_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
+ bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
+ bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
+
+ return 0;
+}
+
+static int bch2_vfs_write_inode(struct inode *vinode,
+ struct writeback_control *wbc)
+{
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ int ret;
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+
+ return bch2_err_class(ret);
+}
+
+static void bch2_evict_inode(struct inode *vinode)
+{
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+
+ truncate_inode_pages_final(&inode->v.i_data);
+
+ clear_inode(&inode->v);
+
+ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
+
+ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
+ KEY_TYPE_QUOTA_WARN);
+ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
+ KEY_TYPE_QUOTA_WARN);
+ bch2_inode_rm(c, inode_inum(inode));
+ }
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_del_init(&inode->ei_vfs_inode_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+}
+
+void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
+{
+ struct bch_inode_info *inode, **i;
+ DARRAY(struct bch_inode_info *) grabbed;
+ bool clean_pass = false, this_pass_clean;
+
+ /*
+ * Initially, we scan for inodes without I_DONTCACHE, then mark them to
+ * be pruned with d_mark_dontcache().
+ *
+ * Once we've had a clean pass where we didn't find any inodes without
+ * I_DONTCACHE, we wait for them to be freed:
+ */
+
+ darray_init(&grabbed);
+ darray_make_room(&grabbed, 1024);
+again:
+ cond_resched();
+ this_pass_clean = true;
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
+ if (!snapshot_list_has_id(s, inode->ei_subvol))
+ continue;
+
+ if (!(inode->v.i_state & I_DONTCACHE) &&
+ !(inode->v.i_state & I_FREEING) &&
+ igrab(&inode->v)) {
+ this_pass_clean = false;
+
+ if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
+ iput(&inode->v);
+ break;
+ }
+ } else if (clean_pass && this_pass_clean) {
+ wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
+ DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ goto again;
+ }
+ }
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ darray_for_each(grabbed, i) {
+ inode = *i;
+ d_mark_dontcache(&inode->v);
+ d_prune_aliases(&inode->v);
+ iput(&inode->v);
+ }
+ grabbed.nr = 0;
+
+ if (!clean_pass || !this_pass_clean) {
+ clean_pass = this_pass_clean;
+ goto again;
+ }
+
+ darray_exit(&grabbed);
+}
+
+static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
+ unsigned shift = sb->s_blocksize_bits - 9;
+ /*
+ * this assumes inodes take up 64 bytes, which is a decent average
+ * number: capacity and used are in 512-byte sectors, so shifting the
+ * free sectors left by 3 (512 / 64 == 8) estimates how many more
+ * inodes could be created
+ */
+ u64 avail_inodes = ((usage.capacity - usage.used) << 3);
+ u64 fsid;
+
+ buf->f_type = BCACHEFS_STATFS_MAGIC;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = usage.capacity >> shift;
+ buf->f_bfree = usage.free >> shift;
+ buf->f_bavail = avail_factor(usage.free) >> shift;
+
+ buf->f_files = usage.nr_inodes + avail_inodes;
+ buf->f_ffree = avail_inodes;
+
+ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
+ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_namelen = BCH_NAME_MAX;
+
+ return 0;
+}
+
+static int bch2_sync_fs(struct super_block *sb, int wait)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
+ if (!wait) {
+ bch2_journal_flush_async(&c->journal, NULL);
+ return 0;
+ }
+
+ ret = bch2_journal_flush(&c->journal);
+ return bch2_err_class(ret);
+}
+
+static struct bch_fs *bch2_path_to_fs(const char *path)
+{
+ struct bch_fs *c;
+ dev_t dev;
+ int ret;
+
+ ret = lookup_bdev(path, &dev);
+ if (ret)
+ return ERR_PTR(ret);
+
+ c = bch2_dev_to_fs(dev);
+ if (c)
+ closure_put(&c->cl);
+ return c ?: ERR_PTR(-ENOENT);
+}
+
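+/*
+ * Multi-device filesystems are mounted as "dev1:dev2:dev3": split the mount
+ * string into a NULL-terminated array of device paths, all backed by a single
+ * kstrdup()'d buffer - which is why callers free devs[0] and then devs.
+ */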
+static char **split_devs(const char *_dev_name, unsigned *nr)
+{
+ char *dev_name = NULL, **devs = NULL, *s;
+ size_t i = 0, nr_devs = 0;
+
+ dev_name = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return NULL;
+
+ for (s = dev_name; s; s = strchr(s + 1, ':'))
+ nr_devs++;
+
+ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
+ if (!devs) {
+ kfree(dev_name);
+ return NULL;
+ }
+
+ while ((s = strsep(&dev_name, ":")))
+ devs[i++] = s;
+
+ *nr = nr_devs;
+ return devs;
+}
+
+static int bch2_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_opts opts = bch2_opts_empty();
+ int ret;
+
+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
+
+ ret = bch2_parse_mount_opts(c, &opts, data);
+ if (ret)
+ goto err;
+
+ if (opts.read_only != c->opts.read_only) {
+ down_write(&c->state_lock);
+
+ if (opts.read_only) {
+ bch2_fs_read_only(c);
+
+ sb->s_flags |= SB_RDONLY;
+ } else {
+ ret = bch2_fs_read_write(c);
+ if (ret) {
+ bch_err(c, "error going rw: %i", ret);
+ up_write(&c->state_lock);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ sb->s_flags &= ~SB_RDONLY;
+ }
+
+ c->opts.read_only = opts.read_only;
+
+ up_write(&c->state_lock);
+ }
+
+ if (opt_defined(opts, errors))
+ c->opts.errors = opts.errors;
+err:
+ return bch2_err_class(ret);
+}
+
+static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
+{
+ struct bch_fs *c = root->d_sb->s_fs_info;
+ struct bch_dev *ca;
+ unsigned i;
+ bool first = true;
+
+ for_each_online_member(ca, c, i) {
+ if (!first)
+ seq_putc(seq, ':');
+ first = false;
+ seq_puts(seq, ca->disk_sb.sb_name);
+ }
+
+ return 0;
+}
+
+static int bch2_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct bch_fs *c = root->d_sb->s_fs_info;
+ enum bch_opt_id i;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+ u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+ if (!(opt->flags & OPT_MOUNT))
+ continue;
+
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ printbuf_reset(&buf);
+ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
+ OPT_SHOW_MOUNT_STYLE);
+ seq_putc(seq, ',');
+ seq_puts(seq, buf.buf);
+ }
+
+ if (buf.allocation_failure)
+ ret = -ENOMEM;
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static void bch2_put_super(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ __bch2_fs_stop(c);
+}
+
+/*
+ * bcachefs doesn't currently integrate intwrite freeze protection but the
+ * internal write references serve the same purpose. Therefore reuse the
+ * read-only transition code to perform the quiesce. The caveat is that we don't
+ * currently have the ability to block tasks that want a write reference while
+ * the superblock is frozen. This is fine for now, but we should either add
+ * blocking support or find a way to integrate sb_start_intwrite() and friends.
+ */
+static int bch2_freeze(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
+ return 0;
+}
+
+static int bch2_unfreeze(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ return 0;
+
+ down_write(&c->state_lock);
+ ret = bch2_fs_read_write(c);
+ up_write(&c->state_lock);
+ return ret;
+}
+
+static const struct super_operations bch_super_operations = {
+ .alloc_inode = bch2_alloc_inode,
+ .destroy_inode = bch2_destroy_inode,
+ .write_inode = bch2_vfs_write_inode,
+ .evict_inode = bch2_evict_inode,
+ .sync_fs = bch2_sync_fs,
+ .statfs = bch2_statfs,
+ .show_devname = bch2_show_devname,
+ .show_options = bch2_show_options,
+ .remount_fs = bch2_remount,
+ .put_super = bch2_put_super,
+ .freeze_fs = bch2_freeze,
+ .unfreeze_fs = bch2_unfreeze,
+};
+
+static int bch2_set_super(struct super_block *s, void *data)
+{
+ s->s_fs_info = data;
+ return 0;
+}
+
+static int bch2_noset_super(struct super_block *s, void *data)
+{
+ return -EBUSY;
+}
+
+static int bch2_test_super(struct super_block *s, void *data)
+{
+ struct bch_fs *c = s->s_fs_info;
+ struct bch_fs **devs = data;
+ unsigned i;
+
+ if (!c)
+ return false;
+
+ for (i = 0; devs[i]; i++)
+ if (c != devs[i])
+ return false;
+ return true;
+}
+
+static struct dentry *bch2_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct bch_fs *c;
+ struct bch_dev *ca;
+ struct super_block *sb;
+ struct inode *vinode;
+ struct bch_opts opts = bch2_opts_empty();
+ char **devs;
+ struct bch_fs **devs_to_fs = NULL;
+ unsigned i, nr_devs;
+ int ret;
+
+ opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
+
+ ret = bch2_parse_mount_opts(NULL, &opts, data);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!dev_name || strlen(dev_name) == 0)
+ return ERR_PTR(-EINVAL);
+
+ devs = split_devs(dev_name, &nr_devs);
+ if (!devs)
+ return ERR_PTR(-ENOMEM);
+
+ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
+ if (!devs_to_fs) {
+ sb = ERR_PTR(-ENOMEM);
+ goto got_sb;
+ }
+
+ for (i = 0; i < nr_devs; i++)
+ devs_to_fs[i] = bch2_path_to_fs(devs[i]);
+
+ sb = sget(fs_type, bch2_test_super, bch2_noset_super,
+ flags|SB_NOSEC, devs_to_fs);
+ if (!IS_ERR(sb))
+ goto got_sb;
+
+ c = bch2_fs_open(devs, nr_devs, opts);
+ if (IS_ERR(c)) {
+ sb = ERR_CAST(c);
+ goto got_sb;
+ }
+
+ /* Some options can't be parsed until after the fs is started: */
+ ret = bch2_parse_mount_opts(c, &opts, data);
+ if (ret) {
+ bch2_fs_stop(c);
+ sb = ERR_PTR(ret);
+ goto got_sb;
+ }
+
+ bch2_opts_apply(&c->opts, opts);
+
+ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
+ if (IS_ERR(sb))
+ bch2_fs_stop(c);
+got_sb:
+ kfree(devs_to_fs);
+ kfree(devs[0]);
+ kfree(devs);
+
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ ret = bch2_err_class(ret);
+ return ERR_PTR(ret);
+ }
+
+ c = sb->s_fs_info;
+
+ if (sb->s_root) {
+ if ((flags ^ sb->s_flags) & SB_RDONLY) {
+ ret = -EBUSY;
+ goto err_put_super;
+ }
+ goto out;
+ }
+
+ sb->s_blocksize = block_bytes(c);
+ sb->s_blocksize_bits = ilog2(block_bytes(c));
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &bch_super_operations;
+ sb->s_export_op = &bch_export_ops;
+#ifdef CONFIG_BCACHEFS_QUOTA
+ sb->s_qcop = &bch2_quotactl_operations;
+ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
+#endif
+ sb->s_xattr = bch2_xattr_handlers;
+ sb->s_magic = BCACHEFS_STATFS_MAGIC;
+ sb->s_time_gran = c->sb.nsec_per_time_unit;
+ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
+ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
+ c->vfs_sb = sb;
+ strscpy(sb->s_id, c->name, sizeof(sb->s_id));
+
+ ret = super_setup_bdi(sb);
+ if (ret)
+ goto err_put_super;
+
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
+
+ for_each_online_member(ca, c, i) {
+ struct block_device *bdev = ca->disk_sb.bdev;
+
+ /* XXX: create an anonymous device for multi device filesystems */
+ sb->s_bdev = bdev;
+ sb->s_dev = bdev->bd_dev;
+ percpu_ref_put(&ca->io_ref);
+ break;
+ }
+
+ c->dev = sb->s_dev;
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ if (c->opts.acl)
+ sb->s_flags |= SB_POSIXACL;
+#endif
+
+ sb->s_shrink->seeks = 0;
+
+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
+ ret = PTR_ERR_OR_ZERO(vinode);
+ if (ret) {
+ bch_err_msg(c, ret, "mounting: error getting root inode");
+ goto err_put_super;
+ }
+
+ sb->s_root = d_make_root(vinode);
+ if (!sb->s_root) {
+ bch_err(c, "error mounting: error allocating root dentry");
+ ret = -ENOMEM;
+ goto err_put_super;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+out:
+ return dget(sb->s_root);
+
+err_put_super:
+ deactivate_locked_super(sb);
+ return ERR_PTR(bch2_err_class(ret));
+}
+
+static void bch2_kill_sb(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ generic_shutdown_super(sb);
+ bch2_fs_free(c);
+}
+
+static struct file_system_type bcache_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bcachefs",
+ .mount = bch2_mount,
+ .kill_sb = bch2_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+MODULE_ALIAS_FS("bcachefs");
+
+void bch2_vfs_exit(void)
+{
+ unregister_filesystem(&bcache_fs_type);
+ kmem_cache_destroy(bch2_inode_cache);
+}
+
+int __init bch2_vfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
+ if (!bch2_inode_cache)
+ goto err;
+
+ ret = register_filesystem(&bcache_fs_type);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ bch2_vfs_exit();
+ return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
new file mode 100644
index 000000000000..5edf1d4b9e6b
--- /dev/null
+++ b/fs/bcachefs/fs.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_H
+#define _BCACHEFS_FS_H
+
+#include "inode.h"
+#include "opts.h"
+#include "str_hash.h"
+#include "quota_types.h"
+#include "two_state_shared_lock.h"
+
+#include <linux/seqlock.h>
+#include <linux/stat.h>
+
+struct bch_inode_info {
+ struct inode v;
+ struct list_head ei_vfs_inode_list;
+ unsigned long ei_flags;
+
+ struct mutex ei_update_lock;
+ u64 ei_quota_reserved;
+ unsigned long ei_last_dirtied;
+ two_state_lock_t ei_pagecache_lock;
+
+ struct mutex ei_quota_lock;
+ struct bch_qid ei_qid;
+
+ u32 ei_subvol;
+
+ /*
+ * When we've been doing nocow writes we'll need to issue flushes to the
+ * underlying block devices
+ *
+ * XXX: a device may have had a flush issued by some other codepath. It
+ * would be better to keep for each device a sequence number that's
+ * incremented when we issue a cache flush, and track here the sequence
+ * number that needs flushing.
+ */
+ struct bch_devs_mask ei_devs_need_flush;
+
+ /* copy of inode in btree: */
+ struct bch_inode_unpacked ei_inode;
+};
+
+#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
+
+#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
+#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
+
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+ return (subvol_inum) {
+ .subvol = inode->ei_subvol,
+ .inum = inode->ei_inode.bi_inum,
+ };
+}
+
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR 0
+
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
+ * those:
+ */
+#define EI_INODE_SNAPSHOT 1
+
+#define to_bch_ei(_inode) \
+ container_of_or_null(_inode, struct bch_inode_info, v)
+
+static inline int ptrcmp(void *l, void *r)
+{
+ return cmp_int(l, r);
+}
+
+enum bch_inode_lock_op {
+ INODE_LOCK = (1U << 0),
+ INODE_PAGECACHE_BLOCK = (1U << 1),
+ INODE_UPDATE_LOCK = (1U << 2),
+};
+
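+/*
+ * Lock (and unlock) several inodes at once, e.g. for rename: sorting the
+ * pointers with bubble_sort() gives every caller the same lock order, and
+ * duplicate pointers are skipped so the same inode may be passed twice.
+ */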
+#define bch2_lock_inodes(_locks, ...) \
+do { \
+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
+ unsigned i; \
+ \
+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
+ \
+ for (i = 1; i < ARRAY_SIZE(a); i++) \
+ if (a[i] != a[i - 1]) { \
+ if ((_locks) & INODE_LOCK) \
+ down_write_nested(&a[i]->v.i_rwsem, i); \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_get(a[i]);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
+ mutex_lock_nested(&a[i]->ei_update_lock, i);\
+ } \
+} while (0)
+
+#define bch2_unlock_inodes(_locks, ...) \
+do { \
+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
+ unsigned i; \
+ \
+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
+ \
+ for (i = 1; i < ARRAY_SIZE(a); i++) \
+ if (a[i] != a[i - 1]) { \
+ if ((_locks) & INODE_LOCK) \
+ up_write(&a[i]->v.i_rwsem); \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_put(a[i]);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
+ mutex_unlock(&a[i]->ei_update_lock); \
+ } \
+} while (0)
+
+static inline struct bch_inode_info *file_bch_inode(struct file *file)
+{
+ return to_bch_ei(file_inode(file));
+}
+
+static inline bool inode_attr_changing(struct bch_inode_info *dir,
+ struct bch_inode_info *inode,
+ enum inode_opt_id id)
+{
+ return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
+ bch2_inode_opt_get(&dir->ei_inode, id) !=
+ bch2_inode_opt_get(&inode->ei_inode, id);
+}
+
+static inline bool inode_attrs_changing(struct bch_inode_info *dir,
+ struct bch_inode_info *inode)
+{
+ unsigned id;
+
+ for (id = 0; id < Inode_opt_nr; id++)
+ if (inode_attr_changing(dir, inode, id))
+ return true;
+
+ return false;
+}
+
+struct bch_inode_unpacked;
+
+#ifndef NO_BCACHEFS_FS
+
+struct bch_inode_info *
+__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
+ struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
+int bch2_fs_quota_transfer(struct bch_fs *,
+ struct bch_inode_info *,
+ struct bch_qid,
+ unsigned,
+ enum quota_acct_mode);
+
+static inline int bch2_set_projid(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ u32 projid)
+{
+ struct bch_qid qid = inode->ei_qid;
+
+ qid.q[QTYP_PRJ] = projid;
+
+ return bch2_fs_quota_transfer(c, inode, qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
+
+/* returns 0 if we want to do the update, or the error is passed up */
+typedef int (*inode_set_fn)(struct btree_trans *,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *, void *);
+
+void bch2_inode_update_after_write(struct btree_trans *,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *,
+ unsigned);
+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
+ inode_set_fn, void *, unsigned);
+
+int bch2_setattr_nonsize(struct mnt_idmap *,
+ struct bch_inode_info *,
+ struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
+
+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
+
+void bch2_vfs_exit(void);
+int bch2_vfs_init(void);
+
+#else
+
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
+
+static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ snapshot_id_list *s) {}
+static inline void bch2_vfs_exit(void) {}
+static inline int bch2_vfs_init(void) { return 0; }
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
new file mode 100644
index 000000000000..e0c5cd119acc
--- /dev/null
+++ b/fs/bcachefs/fsck.c
@@ -0,0 +1,2490 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "darray.h"
+#include "dirent.h"
+#include "error.h"
+#include "fs-common.h"
+#include "fsck.h"
+#include "inode.h"
+#include "keylist.h"
+#include "recovery.h"
+#include "snapshot.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/bsearch.h>
+#include <linux/dcache.h> /* struct qstr */
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+/*
+ * XXX: this is handling transaction restarts without returning
+ * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
+ */
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 sectors = 0;
+ int ret;
+
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ret)
+ if (bkey_extent_is_allocation(k.k))
+ sectors += k.k->size;
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: sectors;
+}
+
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u64 subdirs = 0;
+ int ret;
+
+ for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ret) {
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_DIR)
+ subdirs++;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: subdirs;
+}
+
+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+ u32 *subvol)
+{
+ struct bch_snapshot s;
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
+ POS(0, snapshot), 0,
+ snapshot, &s);
+ if (!ret)
+ *subvol = le32_to_cpu(s.subvol);
+ else if (bch2_err_matches(ret, ENOENT))
+ bch_err(trans->c, "snapshot %u not found", snapshot);
+ return ret;
+}
+
+static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ struct bch_subvolume s;
+ int ret;
+
+ ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+
+ *snapshot = le32_to_cpu(s.snapshot);
+ *inum = le64_to_cpu(s.inode);
+ return ret;
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
+}
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ POS(0, inode_nr),
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, inode);
+err:
+ bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode_nr, *snapshot), 0);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bkey_is_inode(k.k)
+ ? bch2_inode_unpack(k, inode)
+ : -BCH_ERR_ENOENT_inode;
+ if (!ret)
+ *snapshot = iter.pos.snapshot;
+err:
+ bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
+{
+ return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
+}
+
+static int __lookup_dirent(struct btree_trans *trans,
+ struct bch_hash_info hash_info,
+ subvol_inum dir, struct qstr *name,
+ u64 *target, unsigned *type)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0);
+ if (ret)
+ return ret;
+
+ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ *target = le64_to_cpu(d.v->d_inum);
+ *type = d.v->d_type;
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int __write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct bkey_inode_buf *inode_p =
+ bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack(inode_p, inode);
+ inode_p->inode.k.p.snapshot = snapshot;
+
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &inode_p->inode.k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static int fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ int ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ __write_inode(trans, inode, snapshot));
+ if (ret)
+ bch_err_fn(trans->c, ret);
+ return ret;
+}
+
+static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bch_inode_unpacked dir_inode;
+ struct bch_hash_info dir_hash_info;
+ int ret;
+
+ ret = lookup_first_inode(trans, pos.inode, &dir_inode);
+ if (ret)
+ goto err;
+
+ dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/* Get lost+found, create if it doesn't exist: */
+static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
+ struct bch_inode_unpacked *lostfound)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked root;
+ struct bch_hash_info root_hash_info;
+ struct qstr lostfound_str = QSTR("lost+found");
+ subvol_inum root_inum = { .subvol = subvol };
+ u64 inum = 0;
+ unsigned d_type = 0;
+ u32 snapshot;
+ int ret;
+
+ ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+ if (ret)
+ return ret;
+
+ ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+ if (ret)
+ return ret;
+
+ root_hash_info = bch2_hash_info_init(c, &root);
+
+ ret = __lookup_dirent(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type);
+ if (bch2_err_matches(ret, ENOENT)) {
+ bch_notice(c, "creating lost+found");
+ goto create_lostfound;
+ }
+
+ bch_err_fn(c, ret);
+ if (ret)
+ return ret;
+
+ if (d_type != DT_DIR) {
+ bch_err(c, "error looking up lost+found: not a directory");
+ return -BCH_ERR_ENOENT_not_directory;
+ }
+
+ /*
+ * The bch2_check_dirents pass has already run, so dangling dirents
+ * shouldn't exist here:
+ */
+ return __lookup_inode(trans, inum, lostfound, &snapshot);
+
+create_lostfound:
+ bch2_inode_init_early(c, lostfound);
+
+ ret = bch2_create_trans(trans, root_inum, &root,
+ lostfound, &lostfound_str,
+ 0, 0, S_IFDIR|0700, 0, NULL, NULL,
+ (subvol_inum) { }, 0);
+ bch_err_msg(c, ret, "creating lost+found");
+ return ret;
+}
+
+static int __reattach_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
+{
+ struct bch_hash_info dir_hash;
+ struct bch_inode_unpacked lostfound;
+ char name_buf[20];
+ struct qstr name;
+ u64 dir_offset = 0;
+ u32 subvol;
+ int ret;
+
+ ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
+ if (ret)
+ return ret;
+
+ ret = lookup_lostfound(trans, subvol, &lostfound);
+ if (ret)
+ return ret;
+
+ if (S_ISDIR(inode->bi_mode)) {
+ lostfound.bi_nlink++;
+
+ ret = __write_inode(trans, &lostfound, U32_MAX);
+ if (ret)
+ return ret;
+ }
+
+ dir_hash = bch2_hash_info_init(trans->c, &lostfound);
+
+ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+ name = (struct qstr) QSTR(name_buf);
+
+ ret = bch2_dirent_create(trans,
+ (subvol_inum) {
+ .subvol = subvol,
+ .inum = lostfound.bi_inum,
+ },
+ &dir_hash,
+ inode_d_type(inode),
+ &name, inode->bi_inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ return ret;
+
+ inode->bi_dir = lostfound.bi_inum;
+ inode->bi_dir_offset = dir_offset;
+
+ return __write_inode(trans, inode, inode_snapshot);
+}
+
+static int reattach_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
+{
+ int ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ __reattach_inode(trans, inode, inode_snapshot));
+ bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
+ return ret;
+}
+
+static int remove_backpointer(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
+ POS(inode->bi_dir, inode->bi_dir_offset), 0,
+ dirent);
+ ret = bkey_err(d) ?:
+ __remove_dirent(trans, d.k->p);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
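+/*
+ * While walking keys in btree order, fsck needs to know which snapshots it has
+ * already seen a key in at the current position; snapshots_seen records that
+ * list, keeping both the raw snapshot ID and its equivalence class.
+ */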
+struct snapshots_seen_entry {
+ u32 id;
+ u32 equiv;
+};
+
+struct snapshots_seen {
+ struct bpos pos;
+ DARRAY(struct snapshots_seen_entry) ids;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+ darray_exit(&s->ids);
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+ memset(s, 0, sizeof(*s));
+}
+
+static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+ struct snapshots_seen_entry *i, n = {
+ .id = id,
+ .equiv = bch2_snapshot_equiv(c, id),
+ };
+ int ret = 0;
+
+ darray_for_each(s->ids, i) {
+ if (i->id == id)
+ return 0;
+ if (i->id > id)
+ break;
+ }
+
+ ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
+}
+
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct snapshots_seen_entry *i, n = {
+ .id = pos.snapshot,
+ .equiv = bch2_snapshot_equiv(c, pos.snapshot),
+ };
+ int ret = 0;
+
+ if (!bkey_eq(s->pos, pos))
+ s->ids.nr = 0;
+
+ s->pos = pos;
+ s->pos.snapshot = n.equiv;
+
+ darray_for_each(s->ids, i) {
+ if (i->id == n.id)
+ return 0;
+
+ /*
+ * We don't currently rigorously track whether snapshot cleanup needs
+ * to be run, so it shouldn't be a fsck error yet:
+ */
+ if (i->equiv == n.equiv) {
+ bch_err(c, "snapshot deletion did not finish:\n"
+ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
+ bch2_btree_id_str(btree_id),
+ pos.inode, pos.offset,
+ i->id, n.id, n.equiv);
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
+ }
+ }
+
+ ret = darray_push(&s->ids, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
+}
+
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * @c: filesystem handle
+ * @seen: list of snapshot ids already seen at current position
+ * @id: descendant snapshot id
+ * @ancestor: ancestor snapshot id
+ *
+ * Returns: whether key in @ancestor snapshot is visible in @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+ u32 id, u32 ancestor)
+{
+ ssize_t i;
+
+ EBUG_ON(id > ancestor);
+ EBUG_ON(!bch2_snapshot_is_equiv(c, id));
+ EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
+
+ /* @ancestor should be the snapshot most recently added to @seen */
+ EBUG_ON(ancestor != seen->pos.snapshot);
+ EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
+
+ if (id == ancestor)
+ return true;
+
+ if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+ return false;
+
+ /*
+ * We know that @id is a descendant of @ancestor; we're checking if
+ * we've seen a key that overwrote @ancestor - i.e. one that is also a
+ * descendant of @ancestor and has @id as a descendant.
+ *
+ * But we already know that we're scanning IDs between @id and @ancestor
+ * numerically, since snapshot ID lists are kept sorted, so if we find
+ * an id that's an ancestor of @id we're done:
+ */
+
+ for (i = seen->ids.nr - 2;
+ i >= 0 && seen->ids.data[i].equiv >= id;
+ --i)
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
+ return false;
+
+ return true;
+}
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * @c: filesystem handle
+ * @s: list of snapshot IDs already seen at @src
+ * @src: snapshot ID of src key
+ * @dst: snapshot ID of dst key
+ * Returns: true if there is some snapshot in which @dst is visible
+ *
+ * Assumes we're visiting @src keys in natural key order
+ */
+static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+ u32 src, u32 dst)
+{
+ return dst <= src
+ ? key_visible_in_snapshot(c, s, dst, src)
+ : bch2_snapshot_is_ancestor(c, src, dst);
+}
+
+static int ref_visible2(struct bch_fs *c,
+ u32 src, struct snapshots_seen *src_seen,
+ u32 dst, struct snapshots_seen *dst_seen)
+{
+ src = bch2_snapshot_equiv(c, src);
+ dst = bch2_snapshot_equiv(c, dst);
+
+ if (dst > src) {
+ swap(dst, src);
+ swap(dst_seen, src_seen);
+ }
+ return key_visible_in_snapshot(c, src_seen, dst, src);
+}
+
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
+ (_i)->snapshot <= (_snapshot); _i++) \
+ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+
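+/*
+ * inode_walker caches every snapshot version of the inode whose keys are
+ * currently being walked, so per-snapshot state (e.g. the count field) can be
+ * accumulated as extents or dirents are scanned.
+ */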
+struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ bool seen_this_pos;
+ u64 count;
+};
+
+struct inode_walker {
+ bool first_this_inode;
+ bool recalculate_sums;
+ struct bpos last_pos;
+
+ DARRAY(struct inode_walker_entry) inodes;
+};
+
+static void inode_walker_exit(struct inode_walker *w)
+{
+ darray_exit(&w->inodes);
+}
+
+static struct inode_walker inode_walker_init(void)
+{
+ return (struct inode_walker) { 0, };
+}
+
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+ struct bkey_s_c inode)
+{
+ struct bch_inode_unpacked u;
+
+ BUG_ON(bch2_inode_unpack(inode, &u));
+
+ return darray_push(&w->inodes, ((struct inode_walker_entry) {
+ .inode = u,
+ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot),
+ }));
+}
+
+static int get_inodes_all_snapshots(struct btree_trans *trans,
+ struct inode_walker *w, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 restart_count = trans->restart_count;
+ int ret;
+
+ w->recalculate_sums = false;
+ w->inodes.nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+
+ if (bkey_is_inode(k.k))
+ add_inode(c, w, k);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ w->first_this_inode = true;
+
+ return trans_was_restarted(trans, restart_count);
+}
+
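+/*
+ * Find the inode_walker entry matching a key's snapshot: the match is the
+ * closest ancestor snapshot we have an inode for. If the key is in a more
+ * specific snapshot than the inode (and isn't a whiteout), a new entry is
+ * inserted for the key's snapshot.
+ */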
+static struct inode_walker_entry *
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
+ u32 snapshot, bool is_whiteout)
+{
+ struct inode_walker_entry *i;
+
+ snapshot = bch2_snapshot_equiv(c, snapshot);
+
+ darray_for_each(w->inodes, i)
+ if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
+ goto found;
+
+ return NULL;
+found:
+ BUG_ON(snapshot > i->snapshot);
+
+ if (snapshot != i->snapshot && !is_whiteout) {
+ struct inode_walker_entry new = *i;
+ size_t pos;
+ int ret;
+
+ new.snapshot = snapshot;
+ new.count = 0;
+
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
+ w->last_pos.inode, snapshot, i->snapshot);
+
+ while (i > w->inodes.data && i[-1].snapshot > snapshot)
+ --i;
+
+ pos = i - w->inodes.data;
+ ret = darray_insert_item(&w->inodes, pos, new);
+ if (ret)
+ return ERR_PTR(ret);
+
+ i = w->inodes.data + pos;
+ }
+
+ return i;
+}
+
+static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
+ struct inode_walker *w, struct bpos pos,
+ bool is_whiteout)
+{
+ if (w->last_pos.inode != pos.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, pos.inode);
+ if (ret)
+ return ERR_PTR(ret);
+ } else if (bkey_cmp(w->last_pos, pos)) {
+ struct inode_walker_entry *i;
+
+ darray_for_each(w->inodes, i)
+ i->seen_this_pos = false;
+ }
+
+ w->last_pos = pos;
+
+ return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
+}
+
+static int __get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ w->inodes.nr = 0;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ if (k.k->p.offset != inum)
+ break;
+
+ if (!ref_visible(c, s, s->pos.snapshot, equiv))
+ continue;
+
+ if (bkey_is_inode(k.k))
+ add_inode(c, w, k);
+
+ if (equiv >= s->pos.snapshot)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
+ bkey_in_missing_snapshot,
+ "key in missing snapshot: %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
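+/*
+ * A hashed key (dirent/xattr) was found at an offset that doesn't match its
+ * hash: delete it in place and re-insert it through the normal hash table
+ * insert path so it lands in the correct slot.
+ */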
+static int hash_redo_key(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c k)
+{
+ struct bkey_i *delete;
+ struct bkey_i *tmp;
+
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ if (IS_ERR(delete))
+ return PTR_ERR(delete);
+
+ tmp = bch2_bkey_make_mut_noupdate(trans, k);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ bkey_init(&delete->k);
+ delete->k.p = k_iter->pos;
+ return bch2_btree_iter_traverse(k_iter) ?:
+ bch2_trans_update(trans, k_iter, delete, 0) ?:
+ bch2_hash_set_snapshot(trans, desc, hash_info,
+ (subvol_inum) { 0, k.k->p.inode },
+ k.k->p.snapshot, tmp,
+ BCH_HASH_SET_MUST_CREATE,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
+}
+
+static int hash_check_key(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c k;
+ u64 hash;
+ int ret = 0;
+
+ if (hash_k.k->type != desc.key_type)
+ return 0;
+
+ hash = desc.hash_bkey(hash_info, hash_k);
+
+ if (likely(hash == hash_k.k->p.offset))
+ return 0;
+
+ if (hash_k.k->p.offset < hash)
+ goto bad_hash;
+
+ for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ if (bkey_eq(k.k->p, hash_k.k->p))
+ break;
+
+ if (fsck_err_on(k.k->type == desc.key_type &&
+ !desc.cmp_bkey(k, hash_k), c,
+ hash_table_key_duplicate,
+ "duplicate hash table keys:\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ buf.buf))) {
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
+ break;
+ }
+
+ if (bkey_deleted(k.k)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto bad_hash;
+ }
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+bad_hash:
+ if (fsck_err(c, hash_table_key_wrong_offset,
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+ bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
+ bch_err_fn(c, ret);
+ if (ret)
+ return ret;
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+fsck_err:
+ goto out;
+}
+
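+/*
+ * Check an individual inode: verify its snapshot exists, check consistency
+ * with other snapshot versions of the same inode, and handle inodes that were
+ * left unlinked or with i_size/i_sectors dirty by an unclean shutdown:
+ */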
+static int check_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_inode_unpacked *prev,
+ struct snapshots_seen *s,
+ bool full)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ bool do_update = false;
+ int ret;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ return 0;
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (!full &&
+ !(u.bi_flags & (BCH_INODE_i_size_dirty|
+ BCH_INODE_i_sectors_dirty|
+ BCH_INODE_unlinked)))
+ return 0;
+
+ if (prev->bi_inum != u.bi_inum)
+ *prev = u;
+
+ if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
+ inode_d_type(prev) != inode_d_type(&u),
+ c, inode_snapshot_mismatch,
+ "inodes in different snapshots don't match")) {
+ bch_err(c, "repair not implemented yet");
+ return -EINVAL;
+ }
+
+ if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
+ bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
+ struct bpos new_min_pos;
+
+ ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+ if (ret)
+ goto err;
+
+ u.bi_flags &= ~(BCH_INODE_i_size_dirty|BCH_INODE_unlinked);
+
+ ret = __write_inode(trans, &u, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck updating inode");
+ if (ret)
+ return ret;
+
+ if (!bpos_eq(new_min_pos, POS_MIN))
+ bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
+ return 0;
+ }
+
+ if (u.bi_flags & BCH_INODE_unlinked &&
+ (!c->sb.clean ||
+ fsck_err(c, inode_unlinked_but_clean,
+ "filesystem marked clean, but inode %llu unlinked",
+ u.bi_inum))) {
+ bch2_trans_unlock(trans);
+ bch2_fs_lazy_rw(c);
+
+ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck deleting inode");
+ return ret;
+ }
+
+ if (u.bi_flags & BCH_INODE_i_size_dirty &&
+ (!c->sb.clean ||
+ fsck_err(c, inode_i_size_dirty_but_clean,
+ "filesystem marked clean, but inode %llu has i_size dirty",
+ u.bi_inum))) {
+ bch_verbose(c, "truncating inode %llu", u.bi_inum);
+
+ bch2_trans_unlock(trans);
+ bch2_fs_lazy_rw(c);
+
+ /*
+ * XXX: need to truncate partial blocks too here - or ideally
+ * just switch units to bytes and that issue goes away
+ */
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
+ iter->pos.snapshot),
+ POS(u.bi_inum, U64_MAX),
+ 0, NULL);
+ bch_err_msg(c, ret, "in fsck truncating inode");
+ if (ret)
+ return ret;
+
+ /*
+ * We truncated without our normal sector accounting hook, just
+ * make sure we recalculate it:
+ */
+ u.bi_flags |= BCH_INODE_i_sectors_dirty;
+
+ u.bi_flags &= ~BCH_INODE_i_size_dirty;
+ do_update = true;
+ }
+
+ if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
+ (!c->sb.clean ||
+ fsck_err(c, inode_i_sectors_dirty_but_clean,
+ "filesystem marked clean, but inode %llu has i_sectors dirty",
+ u.bi_inum))) {
+ s64 sectors;
+
+ bch_verbose(c, "recounting sectors for inode %llu",
+ u.bi_inum);
+
+ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
+ if (sectors < 0) {
+ bch_err_msg(c, sectors, "in fsck recounting inode sectors");
+ return sectors;
+ }
+
+ u.bi_sectors = sectors;
+ u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
+ do_update = true;
+ }
+
+ if (u.bi_flags & BCH_INODE_backptr_untrusted) {
+ u.bi_dir = 0;
+ u.bi_dir_offset = 0;
+ u.bi_flags &= ~BCH_INODE_backptr_untrusted;
+ do_update = true;
+ }
+
+ if (do_update) {
+ ret = __write_inode(trans, &u, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck updating inode");
+ if (ret)
+ return ret;
+ }
+err:
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+noinline_for_stack
+int bch2_check_inodes(struct bch_fs *c)
+{
+ bool full = c->opts.fsck;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bch_inode_unpacked prev = { 0 };
+ struct snapshots_seen s;
+ struct bkey_s_c k;
+ int ret;
+
+ snapshots_seen_init(&s);
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_inode(trans, &iter, k, &prev, &s, full));
+
+ snapshots_seen_exit(&s);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
+
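+/*
+ * Inodes and dirents are linked in both directions: a dirent points at an
+ * inode (or subvolume), and the inode's bi_dir/bi_dir_offset fields point back
+ * at the dirent that references it. These helpers check each direction:
+ */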
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ return d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+}
+
+static int inode_backpointer_exists(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ d = dirent_get_by_pos(trans, &iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
+ ret = bkey_err(d);
+ if (ret)
+ return bch2_err_matches(ret, ENOENT) ? 0 : ret;
+
+ ret = dirent_points_to_inode(d, inode);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+ s64 count2;
+
+ darray_for_each(w->inodes, i) {
+ if (i->inode.bi_sectors == i->count)
+ continue;
+
+ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
+
+ if (w->recalculate_sums)
+ i->count = count2;
+
+ if (i->count != count2) {
+ bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
+ return -BCH_ERR_internal_fsck_err;
+ }
+
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+ c, inode_i_sectors_wrong,
+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+ w->last_pos.inode, i->snapshot,
+ i->inode.bi_sectors, i->count)) {
+ i->inode.bi_sectors = i->count;
+ ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ }
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
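+/*
+ * For checking for overlapping extents: as we walk an inode's extents in
+ * order, we remember the end position of the previous extent seen in each
+ * snapshot, so the next extent can be checked against it:
+ */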
+struct extent_end {
+ u32 snapshot;
+ u64 offset;
+ struct snapshots_seen seen;
+};
+
+struct extent_ends {
+ struct bpos last_pos;
+ DARRAY(struct extent_end) e;
+};
+
+static void extent_ends_reset(struct extent_ends *extent_ends)
+{
+ struct extent_end *i;
+
+ darray_for_each(extent_ends->e, i)
+ snapshots_seen_exit(&i->seen);
+
+ extent_ends->e.nr = 0;
+}
+
+static void extent_ends_exit(struct extent_ends *extent_ends)
+{
+ extent_ends_reset(extent_ends);
+ darray_exit(&extent_ends->e);
+}
+
+static void extent_ends_init(struct extent_ends *extent_ends)
+{
+ memset(extent_ends, 0, sizeof(*extent_ends));
+}
+
+static int extent_ends_at(struct bch_fs *c,
+ struct extent_ends *extent_ends,
+ struct snapshots_seen *seen,
+ struct bkey_s_c k)
+{
+ struct extent_end *i, n = (struct extent_end) {
+ .offset = k.k->p.offset,
+ .snapshot = k.k->p.snapshot,
+ .seen = *seen,
+ };
+
+ n.seen.ids.data = kmemdup(seen->ids.data,
+ sizeof(seen->ids.data[0]) * seen->ids.size,
+ GFP_KERNEL);
+ if (!n.seen.ids.data)
+ return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
+
+ darray_for_each(extent_ends->e, i) {
+ if (i->snapshot == k.k->p.snapshot) {
+ snapshots_seen_exit(&i->seen);
+ *i = n;
+ return 0;
+ }
+
+ if (i->snapshot >= k.k->p.snapshot)
+ break;
+ }
+
+ return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
+}
+
+static int overlapping_extents_found(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos pos1, struct snapshots_seen *pos1_seen,
+ struct bkey pos2,
+ bool *fixed,
+ struct extent_end *extent_end)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter iter1, iter2 = { NULL };
+ struct bkey_s_c k1, k2;
+ int ret;
+
+ BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
+
+ bch2_trans_iter_init(trans, &iter1, btree, pos1,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_NOT_EXTENTS);
+ k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
+ ret = bkey_err(k1);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k1);
+
+ if (!bpos_eq(pos1, k1.k->p)) {
+ prt_str(&buf, "\n wanted\n ");
+ bch2_bpos_to_text(&buf, pos1);
+ prt_str(&buf, "\n ");
+ bch2_bkey_to_text(&buf, &pos2);
+
+ bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
+ __func__, buf.buf);
+ ret = -BCH_ERR_internal_fsck_err;
+ goto err;
+ }
+
+ bch2_trans_copy_iter(&iter2, &iter1);
+
+ while (1) {
+ bch2_btree_iter_advance(&iter2);
+
+ k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
+ ret = bkey_err(k2);
+ if (ret)
+ goto err;
+
+ if (bpos_ge(k2.k->p, pos2.p))
+ break;
+ }
+
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k2);
+
+ if (bpos_gt(k2.k->p, pos2.p) ||
+ pos2.size != k2.k->size) {
+ bch_err(c, "%s: error finding second overlapping extent when repairing%s",
+ __func__, buf.buf);
+ ret = -BCH_ERR_internal_fsck_err;
+ goto err;
+ }
+
+ prt_printf(&buf, "\n overwriting %s extent",
+ pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
+
+ if (fsck_err(c, extent_overlapping,
+ "overlapping extents%s", buf.buf)) {
+ struct btree_iter *old_iter = &iter1;
+ struct disk_reservation res = { 0 };
+
+ if (pos1.snapshot < pos2.p.snapshot) {
+ old_iter = &iter2;
+ swap(k1, k2);
+ }
+
+ trans->extra_journal_res += bch2_bkey_sectors_compressed(k2);
+
+ ret = bch2_trans_update_extent_overwrite(trans, old_iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ k1, k2) ?:
+ bch2_trans_commit(trans, &res, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+ bch2_disk_reservation_put(c, &res);
+
+ if (ret)
+ goto err;
+
+ *fixed = true;
+
+ if (pos1.snapshot == pos2.p.snapshot) {
+ /*
+ * We overwrote the first extent, and did the overwrite
+ * in the same snapshot:
+ */
+ extent_end->offset = bkey_start_offset(&pos2);
+ } else if (pos1.snapshot > pos2.p.snapshot) {
+ /*
+ * We overwrote the first extent in pos2's snapshot:
+ */
+ ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
+ } else {
+ /*
+ * We overwrote the second extent - restart
+ * check_extent() from the top:
+ */
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+ }
+fsck_err:
+err:
+ bch2_trans_iter_exit(trans, &iter2);
+ bch2_trans_iter_exit(trans, &iter1);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int check_overlapping_extents(struct btree_trans *trans,
+ struct snapshots_seen *seen,
+ struct extent_ends *extent_ends,
+ struct bkey_s_c k,
+ u32 equiv,
+ struct btree_iter *iter,
+ bool *fixed)
+{
+ struct bch_fs *c = trans->c;
+ struct extent_end *i;
+ int ret = 0;
+
+ /* transaction restart, running again */
+ if (bpos_eq(extent_ends->last_pos, k.k->p))
+ return 0;
+
+ if (extent_ends->last_pos.inode != k.k->p.inode)
+ extent_ends_reset(extent_ends);
+
+ darray_for_each(extent_ends->e, i) {
+ if (i->offset <= bkey_start_offset(k.k))
+ continue;
+
+ if (!ref_visible2(c,
+ k.k->p.snapshot, seen,
+ i->snapshot, &i->seen))
+ continue;
+
+ ret = overlapping_extents_found(trans, iter->btree_id,
+ SPOS(iter->pos.inode,
+ i->offset,
+ i->snapshot),
+ &i->seen,
+ *k.k, fixed, i);
+ if (ret)
+ goto err;
+ }
+
+ ret = extent_ends_at(c, extent_ends, seen, k);
+ if (ret)
+ goto err;
+
+ extent_ends->last_pos = k.k->p;
+err:
+ return ret;
+}
+
+static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+ unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc_is_encoded(crc) &&
+ crc.uncompressed_size > encoded_extent_max_sectors) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ return 0;
+}
+
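+/*
+ * Check a single extent against the inode(s) it belongs to: the inode must
+ * exist and be a regular file or symlink, the extent must not overlap other
+ * extents, and extents past i_size are deleted (unless i_size is marked
+ * dirty); we also accumulate i_sectors for each inode:
+ */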
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct inode_walker *inode,
+ struct snapshots_seen *s,
+ struct extent_ends *extent_ends)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ struct printbuf buf = PRINTBUF;
+ struct bpos equiv = k.k->p;
+ int ret = 0;
+
+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
+
+ if (inode->last_pos.inode != k.k->p.inode) {
+ ret = check_i_sectors(trans, inode);
+ if (ret)
+ goto err;
+ }
+
+ i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
+ ret = PTR_ERR_OR_ZERO(i);
+ if (ret)
+ goto err;
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ if (fsck_err_on(!i, c, extent_in_missing_inode,
+ "extent in missing inode:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+
+ if (fsck_err_on(i &&
+ !S_ISREG(i->inode.bi_mode) &&
+ !S_ISLNK(i->inode.bi_mode),
+ c, extent_in_non_reg_inode,
+ "extent in non regular inode mode %o:\n %s",
+ i->inode.bi_mode,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+
+ ret = check_overlapping_extents(trans, s, extent_ends, k,
+ equiv.snapshot, iter,
+ &inode->recalculate_sums);
+ if (ret)
+ goto err;
+ }
+
+ /*
+ * Check inodes in reverse order, from oldest snapshots to newest,
+ * starting from the inode that matches this extent's snapshot. If we
+ * didn't have one, iterate over all inodes:
+ */
+ if (!i)
+ i = inode->inodes.data + inode->inodes.nr - 1;
+
+ for (;
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > equiv.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
+ continue;
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ !bkey_extent_is_reservation(k),
+ c, extent_past_end_of_inode,
+ "extent type past end of inode %llu:%u, i_size %llu\n %s",
+ i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct btree_iter iter2;
+
+ bch2_trans_copy_iter(&iter2, iter);
+ bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_btree_delete_at(trans, &iter2,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter2);
+ if (ret)
+ goto err;
+
+ iter->k.type = KEY_TYPE_whiteout;
+ }
+
+ if (bkey_extent_is_allocation(k.k))
+ i->count += k.k->size;
+ }
+
+ i->seen_this_pos = true;
+ }
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+delete:
+ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+}
+
+/*
+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and
+ * that i_size and i_sectors are consistent
+ */
+int bch2_check_extents(struct bch_fs *c)
+{
+ struct inode_walker w = inode_walker_init();
+ struct snapshots_seen s;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct extent_ends extent_ends;
+ struct disk_reservation res = { 0 };
+ int ret = 0;
+
+ snapshots_seen_init(&s);
+ extent_ends_init(&extent_ends);
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent_overbig(trans, &iter, k);
+ })) ?:
+ check_i_sectors(trans, &w);
+
+ bch2_disk_reservation_put(c, &res);
+ extent_ends_exit(&extent_ends);
+ inode_walker_exit(&w);
+ snapshots_seen_exit(&s);
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_check_indirect_extents(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct disk_reservation res = { 0 };
+ int ret = 0;
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ &res, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+ s64 count2;
+
+ darray_for_each(w->inodes, i) {
+ if (i->inode.bi_nlink == i->count)
+ continue;
+
+ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
+ if (count2 < 0)
+ return count2;
+
+ if (i->count != count2) {
+ bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
+ i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_nlink == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(i->inode.bi_nlink != i->count,
+ c, inode_dir_wrong_nlink,
+ "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+ w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
+ i->inode.bi_nlink = i->count;
+ ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ }
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
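+/*
+ * Check a dirent against the inode it points to: repair the inode's
+ * bi_dir/bi_dir_offset backpointer, remove extra links to directories, fix
+ * i_nlink when an inode has links but nlink 0, and repair the dirent's d_type
+ * and d_parent_subvol fields:
+ */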
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ bool backpointer_exists = true;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+
+ if (!inode_points_to_dirent(target, d)) {
+ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
+ if (ret < 0)
+ goto err;
+
+ backpointer_exists = ret;
+ ret = 0;
+
+ if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
+ c, inode_dir_multiple_links,
+ "directory %llu with multiple links",
+ target->bi_inum)) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto out;
+ }
+
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ c, inode_multiple_links_but_nlink_0,
+ "inode %llu type %s has multiple links but i_nlink 0",
+ target->bi_inum, bch2_d_types[d.v->d_type])) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+
+ if (fsck_err_on(!backpointer_exists,
+ c, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
+ c, dirent_d_type_wrong,
+ "incorrect d_type: got %s, should be %s:\n%s",
+ bch2_d_type_str(d.v->d_type),
+ bch2_d_type_str(inode_d_type(target)),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = inode_d_type(target);
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+
+ d = dirent_i_to_s_c(n);
+ }
+
+ if (d.v->d_type == DT_SUBVOL &&
+ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
+ (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
+ fsck_err(c, dirent_d_parent_subvol_wrong,
+ "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ target->bi_parent_subvol))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+
+ d = dirent_i_to_s_c(n);
+ }
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
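+/*
+ * Check a single dirent: it must point to an existing inode (or subvolume),
+ * its hash must put it at the right offset, and its d_type and the target's
+ * backpointer must be consistent with the target inode; we also count
+ * subdirectory dirents so check_subdir_count() can verify i_nlink:
+ */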
+static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *dir,
+ struct inode_walker *target,
+ struct snapshots_seen *s)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_dirent d;
+ struct inode_walker_entry *i;
+ struct printbuf buf = PRINTBUF;
+ struct bpos equiv;
+ int ret = 0;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
+
+ equiv = k.k->p;
+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
+ if (k.k->type == KEY_TYPE_whiteout)
+ goto out;
+
+ if (dir->last_pos.inode != k.k->p.inode) {
+ ret = check_subdir_count(trans, dir);
+ if (ret)
+ goto err;
+ }
+
+ BUG_ON(!iter->path->should_be_locked);
+
+ i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
+ ret = PTR_ERR_OR_ZERO(i);
+ if (ret < 0)
+ goto err;
+
+ if (dir->first_this_inode && dir->inodes.nr)
+ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
+ dir->first_this_inode = false;
+
+ if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
+ "dirent in nonexistent directory:\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
+
+ if (!i)
+ goto out;
+
+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
+ c, dirent_in_non_dir_inode,
+ "dirent in non directory inode type %s:\n%s",
+ bch2_d_type_str(inode_d_type(&i->inode)),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
+
+ ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ /* dirent has been deleted */
+ ret = 0;
+ goto out;
+ }
+
+ if (k.k->type != KEY_TYPE_dirent)
+ goto out;
+
+ d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_SUBVOL) {
+ struct bch_inode_unpacked subvol_root;
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 target_snapshot;
+ u64 target_inum;
+
+ ret = __subvol_lookup(trans, target_subvol,
+ &target_snapshot, &target_inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret, c, dirent_to_missing_subvol,
+ "dirent points to missing subvolume %u",
+ le32_to_cpu(d.v->d_child_subvol))) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto err;
+ }
+
+ ret = __lookup_inode(trans, target_inum,
+ &subvol_root, &target_snapshot);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret, c, subvol_to_missing_root,
+ "subvolume %u points to missing subvolume root %llu",
+ target_subvol,
+ target_inum)) {
+ bch_err(c, "repair not implemented yet");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
+ c, subvol_root_wrong_bi_subvol,
+ "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_subvol, target_subvol)) {
+ subvol_root.bi_subvol = target_subvol;
+ ret = __write_inode(trans, &subvol_root, target_snapshot);
+ if (ret)
+ goto err;
+ }
+
+ ret = check_dirent_target(trans, iter, d, &subvol_root,
+ target_snapshot);
+ if (ret)
+ goto err;
+ } else {
+ ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(!target->inodes.nr,
+ c, dirent_to_missing_inode,
+ "dirent points to missing inode: (equiv %u)\n%s",
+ equiv.snapshot,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ ret = __remove_dirent(trans, d.k->p);
+ if (ret)
+ goto err;
+ }
+
+ darray_for_each(target->inodes, i) {
+ ret = check_dirent_target(trans, iter, d,
+ &i->inode, i->snapshot);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+ i->count++;
+
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
+ * validate d_type
+ */
+int bch2_check_dirents(struct bch_fs *c)
+{
+ struct inode_walker dir = inode_walker_init();
+ struct inode_walker target = inode_walker_init();
+ struct snapshots_seen s;
+ struct bch_hash_info hash_info;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ snapshots_seen_init(&s);
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
+
+ bch2_trans_put(trans);
+ snapshots_seen_exit(&s);
+ inode_walker_exit(&dir);
+ inode_walker_exit(&target);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
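+/*
+ * Check a single xattr: it must belong to an existing inode and be at the
+ * offset its hash dictates:
+ */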
+static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *inode)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret;
+
+ i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
+ ret = PTR_ERR_OR_ZERO(i);
+ if (ret)
+ return ret;
+
+ if (inode->first_this_inode && inode->inodes.nr)
+ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
+ inode->first_this_inode = false;
+
+ if (fsck_err_on(!i, c, xattr_in_missing_inode,
+ "xattr for missing inode %llu",
+ k.k->p.inode))
+ return bch2_btree_delete_at(trans, iter, 0);
+
+ if (!i)
+ return 0;
+
+ ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Walk xattrs: verify that they all have a corresponding inode
+ */
+int bch2_check_xattrs(struct bch_fs *c)
+{
+ struct inode_walker inode = inode_walker_init();
+ struct bch_hash_info hash_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_xattr(trans, &iter, k, &hash_info, &inode)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_root_trans(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked root_inode;
+ u32 snapshot;
+ u64 inum;
+ int ret;
+
+ ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
+ "root subvol missing")) {
+ struct bkey_i_subvolume root_subvol;
+
+ snapshot = U32_MAX;
+ inum = BCACHEFS_ROOT_INO;
+
+ bkey_subvolume_init(&root_subvol.k_i);
+ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol.v.flags = 0;
+ root_subvol.v.snapshot = cpu_to_le32(snapshot);
+ root_subvol.v.inode = cpu_to_le64(inum);
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
+ &root_subvol.k_i, 0));
+ bch_err_msg(c, ret, "writing root subvol");
+ if (ret)
+ goto err;
+
+ }
+
+ ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (mustfix_fsck_err_on(ret, c, root_dir_missing,
+ "root directory missing") ||
+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
+ c, root_inode_not_dir,
+ "root inode not a directory")) {
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+ 0, NULL);
+ root_inode.bi_inum = inum;
+
+ ret = __write_inode(trans, &root_inode, snapshot);
+ bch_err_msg(c, ret, "writing root inode");
+ }
+err:
+fsck_err:
+ return ret;
+}
+
+/* Get root directory, create if it doesn't exist: */
+int bch2_check_root(struct bch_fs *c)
+{
+ int ret;
+
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ check_root_trans(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+struct pathbuf_entry {
+ u64 inum;
+ u32 snapshot;
+};
+
+typedef DARRAY(struct pathbuf_entry) pathbuf;
+
+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
+{
+ struct pathbuf_entry *i;
+
+ darray_for_each(*p, i)
+ if (i->inum == inum &&
+ i->snapshot == snapshot)
+ return true;
+
+ return false;
+}
+
+static int path_down(struct bch_fs *c, pathbuf *p,
+ u64 inum, u32 snapshot)
+{
+ int ret = darray_push(p, ((struct pathbuf_entry) {
+ .inum = inum,
+ .snapshot = snapshot,
+ }));
+
+ if (ret)
+ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+ p->size);
+ return ret;
+}
+
+/*
+ * Check that a given inode is reachable from the root:
+ *
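+ * We do this by walking each inode's bi_dir/bi_dir_offset backpointer up
+ * towards the root, remembering the inodes we pass through so that directory
+ * structure loops can be detected:
+ *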
+ * XXX: we should also be verifying that inodes are in the right subvolumes
+ */
+static int check_path(struct btree_trans *trans,
+ pathbuf *p,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ snapshot = bch2_snapshot_equiv(c, snapshot);
+ p->nr = 0;
+
+ while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ struct btree_iter dirent_iter;
+ struct bkey_s_c_dirent d;
+ u32 parent_snapshot = snapshot;
+
+ if (inode->bi_subvol) {
+ u64 inum;
+
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &parent_snapshot, &inum);
+ if (ret)
+ break;
+ }
+
+ ret = lockrestart_do(trans,
+ PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset,
+ parent_snapshot))).k));
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ break;
+
+ if (!ret && !dirent_points_to_inode(d, inode)) {
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ }
+
+ if (bch2_err_matches(ret, ENOENT)) {
+ if (fsck_err(c, inode_unreachable,
+ "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+ inode->bi_inum, snapshot,
+ bch2_d_type_str(inode_d_type(inode)),
+ inode->bi_nlink,
+ inode->bi_dir,
+ inode->bi_dir_offset))
+ ret = reattach_inode(trans, inode, snapshot);
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &dirent_iter);
+
+ if (!S_ISDIR(inode->bi_mode))
+ break;
+
+ ret = path_down(c, p, inode->bi_inum, snapshot);
+ if (ret) {
+ bch_err(c, "memory allocation failure");
+ return ret;
+ }
+
+ snapshot = parent_snapshot;
+
+ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+ if (ret) {
+ /* Should have been caught in dirents pass */
+ bch_err(c, "error looking up parent directory: %i", ret);
+ break;
+ }
+
+ if (path_is_dup(p, inode->bi_inum, snapshot)) {
+ struct pathbuf_entry *i;
+
+ /* XXX print path */
+ bch_err(c, "directory structure loop");
+
+ darray_for_each(*p, i)
+ pr_err("%llu:%u", i->inum, i->snapshot);
+ pr_err("%llu:%u", inode->bi_inum, snapshot);
+
+ if (!fsck_err(c, dir_loop,
+ "directory structure loop"))
+ return 0;
+
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ remove_backpointer(trans, inode));
+ if (ret) {
+ bch_err(c, "error removing dirent: %i", ret);
+ break;
+ }
+
+ ret = reattach_inode(trans, inode, snapshot);
+ }
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Check for unreachable inodes, as well as loops in the directory structure:
+ * After bch2_check_dirents(), if an inode's backpointer doesn't exist, that
+ * means it's unreachable:
+ */
+int bch2_check_directory_structure(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked u;
+ pathbuf path = { 0, };
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ ret = bch2_inode_unpack(k, &u);
+ if (ret) {
+ /* Should have been caught earlier in fsck: */
+ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
+ break;
+ }
+
+ if (u.bi_flags & BCH_INODE_unlinked)
+ continue;
+
+ ret = check_path(trans, &path, &u, iter.pos.snapshot);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ darray_exit(&path);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
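+/*
+ * Expected link counts, indexed by (inode number, snapshot): populated with
+ * the inodes that can have hardlinks, then filled in by counting the dirents
+ * that point at each one:
+ */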
+struct nlink_table {
+ size_t nr;
+ size_t size;
+
+ struct nlink {
+ u64 inum;
+ u32 snapshot;
+ u32 count;
+ } *d;
+};
+
+static int add_nlink(struct bch_fs *c, struct nlink_table *t,
+ u64 inum, u32 snapshot)
+{
+ if (t->nr == t->size) {
+ size_t new_size = max_t(size_t, 128UL, t->size * 2);
+ void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
+
+ if (!d) {
+ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
+ new_size);
+ return -BCH_ERR_ENOMEM_fsck_add_nlink;
+ }
+
+ if (t->d)
+ memcpy(d, t->d, t->size * sizeof(t->d[0]));
+ kvfree(t->d);
+
+ t->d = d;
+ t->size = new_size;
+ }
+
+ t->d[t->nr++] = (struct nlink) {
+ .inum = inum,
+ .snapshot = snapshot,
+ };
+
+ return 0;
+}
+
+static int nlink_cmp(const void *_l, const void *_r)
+{
+ const struct nlink *l = _l;
+ const struct nlink *r = _r;
+
+ return cmp_int(l->inum, r->inum);
+}
+
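+/*
+ * Bump the link count for the target of a dirent: binary search for the
+ * inode's entries in the table, rewind to the first entry for that inode
+ * number, then increment each snapshot version the dirent is visible in:
+ */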
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end, u64 inum, u32 snapshot)
+{
+ struct nlink *link, key = {
+ .inum = inum, .snapshot = U32_MAX,
+ };
+
+ if (inum < range_start || inum >= range_end)
+ return;
+
+ link = __inline_bsearch(&key, links->d, links->nr,
+ sizeof(links->d[0]), nlink_cmp);
+ if (!link)
+ return;
+
+ while (link > links->d && link[0].inum == link[-1].inum)
+ --link;
+
+ for (; link < links->d + links->nr && link->inum == inum; link++)
+ if (ref_visible(c, s, snapshot, link->snapshot)) {
+ link->count++;
+ if (link->snapshot >= snapshot)
+ break;
+ }
+}
+
+noinline_for_stack
+static int check_nlinks_find_hardlinks(struct bch_fs *c,
+ struct nlink_table *t,
+ u64 start, u64 *end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked u;
+ int ret = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes,
+ POS(0, start),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ /* Should never fail, checked by bch2_inode_invalid: */
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ /*
+ * Backpointer and directory structure checks are sufficient for
+ * directories, since they can't have hardlinks:
+ */
+ if (S_ISDIR(u.bi_mode))
+ continue;
+
+ if (!u.bi_nlink)
+ continue;
+
+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
+ if (ret) {
+ *end = k.k->p.offset;
+ ret = 0;
+ break;
+ }
+
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+
+ return ret;
+}
+
+noinline_for_stack
+static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
+ u64 range_start, u64 range_end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct snapshots_seen s;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ snapshots_seen_init(&s);
+
+ for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
+ if (ret)
+ break;
+
+ switch (k.k->type) {
+ case KEY_TYPE_dirent:
+ d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum),
+ bch2_snapshot_equiv(c, d.k->p.snapshot));
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+
+ bch2_trans_put(trans);
+ snapshots_seen_exit(&s);
+ return ret;
+}
+
+static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct nlink_table *links,
+ size_t *idx, u64 range_end)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct nlink *link = &links->d[*idx];
+ int ret = 0;
+
+ if (k.k->p.offset >= range_end)
+ return 1;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (S_ISDIR(u.bi_mode))
+ return 0;
+
+ if (!u.bi_nlink)
+ return 0;
+
+ while ((cmp_int(link->inum, k.k->p.offset) ?:
+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
+ BUG_ON(*idx == links->nr);
+ link = &links->d[++*idx];
+ }
+
+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
+ c, inode_wrong_nlink,
+ "inode %llu type %s has wrong i_nlink (%u, should be %u)",
+ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
+ bch2_inode_nlink_get(&u), link->count)) {
+ bch2_inode_nlink_set(&u, link->count);
+ ret = __write_inode(trans, &u, k.k->p.snapshot);
+ }
+fsck_err:
+ return ret;
+}
+
+noinline_for_stack
+static int check_nlinks_update_hardlinks(struct bch_fs *c,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t idx = 0;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS(0, range_start),
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
+ if (ret < 0) {
+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_check_nlinks(struct bch_fs *c)
+{
+ struct nlink_table links = { 0 };
+ u64 this_iter_range_start, next_iter_range_start = 0;
+ int ret = 0;
+
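+ /*
+ * The nlink table may not fit in memory all at once: process the inode
+ * number space in ranges, with check_nlinks_find_hardlinks() telling us
+ * where to resume via next_iter_range_start:
+ */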
+ do {
+ this_iter_range_start = next_iter_range_start;
+ next_iter_range_start = U64_MAX;
+
+ ret = check_nlinks_find_hardlinks(c, &links,
+ this_iter_range_start,
+ &next_iter_range_start);
+
+ ret = check_nlinks_walk_dirents(c, &links,
+ this_iter_range_start,
+ next_iter_range_start);
+ if (ret)
+ break;
+
+ ret = check_nlinks_update_hardlinks(c, &links,
+ this_iter_range_start,
+ next_iter_range_start);
+ if (ret)
+ break;
+
+ links.nr = 0;
+ } while (next_iter_range_start != U64_MAX);
+
+ kvfree(links.d);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p;
+ struct bkey_i_reflink_p *u;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_reflink_p)
+ return 0;
+
+ p = bkey_s_c_to_reflink_p(k);
+
+ if (!p.v->front_pad && !p.v->back_pad)
+ return 0;
+
+ u = bch2_trans_kmalloc(trans, sizeof(*u));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&u->k_i, k);
+ u->v.front_pad = 0;
+ u->v.back_pad = 0;
+
+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
+}
+
+int bch2_fix_reflink_p(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
+ return 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ fix_reflink_p_key(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
new file mode 100644
index 000000000000..da991e8cf27e
--- /dev/null
+++ b/fs/bcachefs/fsck.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FSCK_H
+#define _BCACHEFS_FSCK_H
+
+int bch2_check_inodes(struct bch_fs *);
+int bch2_check_extents(struct bch_fs *);
+int bch2_check_indirect_extents(struct bch_fs *);
+int bch2_check_dirents(struct bch_fs *);
+int bch2_check_xattrs(struct bch_fs *);
+int bch2_check_root(struct bch_fs *);
+int bch2_check_directory_structure(struct bch_fs *);
+int bch2_check_nlinks(struct bch_fs *);
+int bch2_fix_reflink_p(struct bch_fs *);
+
+#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
new file mode 100644
index 000000000000..9309cfeecd8d
--- /dev/null
+++ b/fs/bcachefs/inode.c
@@ -0,0 +1,1205 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_write_buffer.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "compress.h"
+#include "dirent.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "str_hash.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "varint.h"
+
+#include <linux/random.h>
+
+#include <asm/unaligned.h>
+
+#define x(name, ...) #name,
+const char * const bch2_inode_opts[] = {
+ BCH_INODE_OPTS()
+ NULL,
+};
+
+static const char * const bch2_inode_flag_strs[] = {
+ BCH_INODE_FLAGS()
+ NULL
+};
+#undef x
+
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
+
+static int inode_decode_field(const u8 *in, const u8 *end,
+ u64 out[2], unsigned *out_bits)
+{
+ __be64 be[2] = { 0, 0 };
+ unsigned bytes, shift;
+ u8 *p;
+
+ if (in >= end)
+ return -1;
+
+ if (!*in)
+ return -1;
+
+ /*
+ * position of highest set bit indicates number of bytes:
+ * shift = number of bits to remove in high byte:
+ */
+ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
+ bytes = byte_table[shift - 1];
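+ /*
+ * e.g. a first byte with the top bit (0x80) set encodes a field one
+ * byte long; a first byte of 0x01 encodes the maximum field length of
+ * 13 bytes:
+ */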
+
+ if (in + bytes > end)
+ return -1;
+
+ p = (u8 *) be + 16 - bytes;
+ memcpy(p, in, bytes);
+ *p ^= (1 << 8) >> shift;
+
+ out[0] = be64_to_cpu(be[0]);
+ out[1] = be64_to_cpu(be[1]);
+ *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
+
+ return bytes;
+}
+
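+/*
+ * Pack an inode into its on disk (v3) format: the fixed fields, followed by
+ * the remaining fields encoded as varints, with trailing zero fields omitted:
+ */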
+static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ struct bkey_i_inode_v3 *k = &packed->inode;
+ u8 *out = k->v.fields;
+ u8 *end = (void *) &packed[1];
+ u8 *last_nonzero_field = out;
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ unsigned bytes;
+ int ret;
+
+ bkey_inode_v3_init(&packed->inode.k_i);
+ packed->inode.k.p.offset = inode->bi_inum;
+ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
+ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
+ packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
+ packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
+ packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
+ SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
+ SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
+
+#define x(_name, _bits) \
+ nr_fields++; \
+ \
+ if (inode->_name) { \
+ ret = bch2_varint_encode_fast(out, inode->_name); \
+ out += ret; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ }
+
+ BCH_INODE_FIELDS_v3()
+#undef x
+ BUG_ON(out > end);
+
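+ /*
+ * Trim trailing fields that packed to zero - the unpack code fills in
+ * missing fields as zeroes:
+ */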
+ out = last_nonzero_field;
+ nr_fields = last_nonzero_fieldnr;
+
+ bytes = out - (u8 *) &packed->inode.v;
+ set_bkey_val_bytes(&packed->inode.k, bytes);
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
+
+ SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ struct bch_inode_unpacked unpacked;
+
+ ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
+ BUG_ON(ret);
+ BUG_ON(unpacked.bi_inum != inode->bi_inum);
+ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
+ BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
+ BUG_ON(unpacked.bi_size != inode->bi_size);
+ BUG_ON(unpacked.bi_version != inode->bi_version);
+ BUG_ON(unpacked.bi_mode != inode->bi_mode);
+
+#define x(_name, _bits) if (unpacked._name != inode->_name) \
+ panic("unpacked %llu should be %llu", \
+ (u64) unpacked._name, (u64) inode->_name);
+ BCH_INODE_FIELDS_v3()
+#undef x
+ }
+}
+
+void bch2_inode_pack(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ bch2_inode_pack_inlined(packed, inode);
+}
+
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ u64 field[2];
+ unsigned fieldnr = 0, field_bits;
+ int ret;
+
+#define x(_name, _bits) \
+ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
+ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
+ memset((void *) unpacked + offset, 0, \
+ sizeof(*unpacked) - offset); \
+ return 0; \
+ } \
+ \
+ ret = inode_decode_field(in, end, field, &field_bits); \
+ if (ret < 0) \
+ return ret; \
+ \
+ if (field_bits > sizeof(unpacked->_name) * 8) \
+ return -1; \
+ \
+ unpacked->_name = field[1]; \
+ in += ret;
+
+ BCH_INODE_FIELDS_v2()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
+ const u8 *in, const u8 *end,
+ unsigned nr_fields)
+{
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+#define x(_name, _bits) \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS_v2()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static int bch2_inode_unpack_v3(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
+ unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
+ unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
+ unpacked->bi_mode = INODEv3_MODE(inode.v);
+
+#define x(_name, _bits) \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS_v3()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ memset(unpacked, 0, sizeof(*unpacked));
+
+ switch (k.k->type) {
+ case KEY_TYPE_inode: {
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= 0;
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ if (INODE_NEW_VARINT(inode.v)) {
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODE_NR_FIELDS(inode.v));
+ } else {
+ return bch2_inode_unpack_v1(inode, unpacked);
+ }
+ break;
+ }
+ case KEY_TYPE_inode_v2: {
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODEv2_NR_FIELDS(inode.v));
+ }
+ default:
+ BUG();
+ }
+}
+
+int bch2_inode_unpack(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ if (likely(k.k->type == KEY_TYPE_inode_v3))
+ return bch2_inode_unpack_v3(k, unpacked);
+ return bch2_inode_unpack_slowpath(k, unpacked);
+}
+
+static int bch2_inode_peek_nowarn(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_unpack(k, inode);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+int bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+ bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
+ return ret;
+}
+
+int bch2_inode_write_flags(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ enum btree_update_flags flags)
+{
+ struct bkey_inode_buf *inode_p;
+
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack_inlined(inode_p, inode);
+ inode_p->inode.k.p.snapshot = iter->snapshot;
+ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
+}
+
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
+{
+ struct bch_inode_unpacked u;
+ struct bkey_inode_buf *inode_p;
+ int ret;
+
+ if (!bkey_is_inode(&k->k))
+ return ERR_PTR(-ENOENT);
+
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return ERR_CAST(inode_p);
+
+ ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
+ if (ret)
+ return ERR_PTR(ret);
+
+ bch2_inode_pack(inode_p, &u);
+ return &inode_p->inode.k_i;
+}
+
+static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
+{
+ struct bch_inode_unpacked unpacked;
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode, c, err,
+ inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
+
+ bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
+ inode_pos_blockdev_range,
+ "fs inode in blockdev range");
+
+ bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
+ inode_unpack_error,
+ "invalid variable length fields");
+
+ bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
+ inode_checksum_type_invalid,
+ "invalid data checksum type (%u >= %u)",
+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
+
+ bkey_fsck_err_on(unpacked.bi_compression &&
+ !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
+ inode_compression_type_invalid,
+ "invalid compression opt %u", unpacked.bi_compression - 1);
+
+ bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
+ unpacked.bi_nlink != 0, c, err,
+ inode_unlinked_but_nlink_nonzero,
+ "flagged as unlinked but bi_nlink != 0");
+
+ bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
+ inode_subvol_root_but_not_dir,
+ "subvolume root but not a directory");
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
+ inode_v3_fields_start_bad,
+ "invalid fields_start (got %llu, min %u max %zu)",
+ INODEv3_FIELDS_START(inode.v),
+ INODEv3_FIELDS_START_INITIAL,
+ bkey_val_u64s(inode.k));
+
+ bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
+}
+
+static void __bch2_inode_unpacked_to_text(struct printbuf *out,
+ struct bch_inode_unpacked *inode)
+{
+ prt_printf(out, "mode=%o ", inode->bi_mode);
+
+ prt_str(out, "flags=");
+ prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
+ prt_printf(out, " (%x)", inode->bi_flags);
+
+ prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
+ inode->bi_journal_seq,
+ inode->bi_size,
+ inode->bi_sectors,
+ inode->bi_version);
+
+#define x(_name, _bits) \
+ prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
+ BCH_INODE_FIELDS_v3()
+#undef x
+}
+
+void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+{
+ prt_printf(out, "inum: %llu ", inode->bi_inum);
+ __bch2_inode_unpacked_to_text(out, inode);
+}
+
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_inode_unpacked inode;
+
+ if (bch2_inode_unpack(k, &inode)) {
+ prt_printf(out, "(unpack error)");
+ return;
+ }
+
+ __bch2_inode_unpacked_to_text(out, &inode);
+}
+
+static inline u64 bkey_inode_flags(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+ case KEY_TYPE_inode_v2:
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+ default:
+ return 0;
+ }
+}
+
+static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+{
+ return bkey_inode_flags(k) & BCH_INODE_unlinked;
+}
+
+int bch2_trans_mark_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
+ bool old_deleted = bkey_is_deleted_inode(old);
+ bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new));
+
+ if (nr) {
+ int ret = bch2_replicas_deltas_realloc(trans, 0);
+ struct replicas_delta_list *d = trans->fs_usage_deltas;
+
+ if (ret)
+ return ret;
+
+ d->nr_inodes += nr;
+ }
+
+ if (old_deleted != new_deleted) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_mark_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *fs_usage;
+ u64 journal_seq = trans->journal_res.seq;
+
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
+
+ BUG_ON(!journal_seq);
+ BUG_ON(new.k->type != KEY_TYPE_inode_v3);
+
+ v->bi_journal_seq = cpu_to_le64(journal_seq);
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+
+ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+ fs_usage->nr_inodes += bkey_is_inode(new.k);
+ fs_usage->nr_inodes -= bkey_is_inode(old.k);
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ }
+ return 0;
+}
+
+int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode, c, err,
+ inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
+fsck_err:
+ return ret;
+}
+
+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
+
+ prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
+}
+
+void bch2_inode_init_early(struct bch_fs *c,
+ struct bch_inode_unpacked *inode_u)
+{
+ enum bch_str_hash_type str_hash =
+ bch2_str_hash_opt_to_type(c, c->opts.str_hash);
+
+ memset(inode_u, 0, sizeof(*inode_u));
+
+ /* ick */
+ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
+ get_random_bytes(&inode_u->bi_hash_seed,
+ sizeof(inode_u->bi_hash_seed));
+}
+
+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
+{
+ inode_u->bi_mode = mode;
+ inode_u->bi_uid = uid;
+ inode_u->bi_gid = gid;
+ inode_u->bi_dev = rdev;
+ inode_u->bi_atime = now;
+ inode_u->bi_mtime = now;
+ inode_u->bi_ctime = now;
+ inode_u->bi_otime = now;
+
+ if (parent && parent->bi_mode & S_ISGID) {
+ inode_u->bi_gid = parent->bi_gid;
+ if (S_ISDIR(mode))
+ inode_u->bi_mode |= S_ISGID;
+ }
+
+ if (parent) {
+#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name;
+ BCH_INODE_OPTS()
+#undef x
+ }
+}
+
+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
+{
+ bch2_inode_init_early(c, inode_u);
+ bch2_inode_init_late(inode_u, bch2_current_time(c),
+ uid, gid, mode, rdev, parent);
+}
+
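+/*
+ * When an inode is deleted, a KEY_TYPE_inode_generation key is left behind at
+ * its position (see bch2_inode_rm()); if the inode number is later reused,
+ * bch2_inode_create() picks the bumped generation up from it via this helper:
+ */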
+static inline u32 bkey_generation(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ case KEY_TYPE_inode_v2:
+ BUG();
+ case KEY_TYPE_inode_generation:
+ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+ default:
+ return 0;
+ }
+}
+
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot, u64 cpu)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ u64 min, max, start, pos, *hint;
+ int ret = 0;
+ unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
+
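+ /*
+ * With shard_inode_numbers, the top inode_shard_bits of the inode
+ * number encode the cpu it was allocated on; each shard then allocates
+ * sequentially within its own range, using its own allocation hint:
+ */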
+ if (c->opts.shard_inode_numbers) {
+ bits -= c->inode_shard_bits;
+
+ min = (cpu << bits);
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+ hint = c->unused_inode_hints + cpu;
+ } else {
+ min = BLOCKDEV_INODE_MAX;
+ max = ~(ULLONG_MAX << bits);
+ hint = c->unused_inode_hints;
+ }
+
+ start = READ_ONCE(*hint);
+
+ if (start >= max || start < min)
+ start = min;
+
+ pos = start;
+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+again:
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k)) &&
+ bkey_lt(k.k->p, POS(0, max))) {
+ if (pos < iter->pos.offset)
+ goto found_slot;
+
+ /*
+ * We don't need to iterate over keys in every snapshot once
+ * we've found just one:
+ */
+ pos = iter->pos.offset + 1;
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
+ }
+
+ if (!ret && pos < max)
+ goto found_slot;
+
+ if (!ret && start == min)
+ ret = -BCH_ERR_ENOSPC_inode_create;
+
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+ }
+
+ /* Retry from start */
+ pos = start = min;
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
+ goto again;
+found_slot:
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+ }
+
+ *hint = k.k->p.offset;
+ inode_u->bi_inum = k.k->p.offset;
+ inode_u->bi_generation = bkey_generation(k);
+ return 0;
+}
+
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+ subvol_inum inum, enum btree_id id)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ struct bpos end = POS(inum.inum, U64_MAX);
+ u32 snapshot;
+ int ret = 0;
+
+ /*
+ * We're never going to be deleting partial extents, no need to use an
+ * extent iterator:
+ */
+ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+ BTREE_ITER_INTENT);
+
+ while (1) {
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_upto(&iter, end);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ break;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ bch2_key_resize(&delete.k,
+ bpos_min(end, k.k->p).offset -
+ iter.pos.offset);
+
+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ /*
+ * If this was a directory, there shouldn't be any real dirents left -
+ * but there could be whiteouts (from hash collisions) that we should
+ * delete:
+ *
+ * XXX: the dirent code could ideally delete whiteouts when they're no
+ * longer needed
+ */
+ ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
+ bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
+ bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch2_fs_inconsistent(c,
+ "inode %llu:%u not found when deleting",
+ inum.inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_inode_unpack(k, &inode_u);
+
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_inode_find_by_inum_trans(trans, inum, inode));
+}
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_flags & BCH_INODE_unlinked)
+ bi->bi_flags &= ~BCH_INODE_unlinked;
+ else {
+ if (bi->bi_nlink == U32_MAX)
+ return -EINVAL;
+
+ bi->bi_nlink++;
+ }
+
+ return 0;
+}
+
+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
+ bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
+ bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_flags & BCH_INODE_unlinked) {
+ bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_nlink)
+ bi->bi_nlink--;
+ else
+ bi->bi_flags |= BCH_INODE_unlinked;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
+{
+ struct bch_opts ret = { 0 };
+#define x(_name, _bits) \
+ if (inode->bi_##_name) \
+ opt_set(ret, _name, inode->bi_##_name - 1);
+ BCH_INODE_OPTS()
+#undef x
+ return ret;
+}
+
+void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
+ struct bch_inode_unpacked *inode)
+{
+#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name);
+ BCH_INODE_OPTS()
+#undef x
+
+ if (opts->nocow)
+ opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
+}
+
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+ struct bch_inode_unpacked inode;
+ int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+ if (ret)
+ return ret;
+
+ bch2_inode_opts_get(opts, trans->c, &inode);
+ return 0;
+}
+
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ int ret;
+
+ do {
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL);
+ } while (ret == -BCH_ERR_transaction_restart_nested);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch2_fs_inconsistent(c,
+ "inode %llu:%u not found when deleting",
+ inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_inode_unpack(k, &inode_u);
+
+ /* Subvolume root? */
+ if (inode_u.bi_subvol)
+ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
+
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+}
+
+static int may_delete_deleted_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos,
+ bool *need_another_pass)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
+ if (fsck_err_on(!bkey_is_inode(k.k), c,
+ deleted_inode_missing,
+ "nonexistent inode %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ goto out;
+
+ if (S_ISDIR(inode.bi_mode)) {
+ ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
+ if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
+ "non empty directory %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+ if (ret)
+ goto out;
+ }
+
+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
+ deleted_inode_not_unlinked,
+ "non-deleted inode %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ if (c->sb.clean &&
+ !fsck_err(c,
+ deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
+ struct bpos new_min_pos;
+
+ ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
+ if (ret)
+ goto out;
+
+ inode.bi_flags &= ~BCH_INODE_unlinked;
+
+ ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch_err_msg(c, ret, "clearing inode unlinked flag");
+ if (ret)
+ goto out;
+
+ /*
+ * We'll need another write buffer flush to pick up the new
+ * unlinked inodes in the snapshot leaves:
+ */
+ *need_another_pass = true;
+ goto out;
+ }
+
+ ret = 1;
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ return ret;
+delete:
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ goto out;
+}
+
+int bch2_delete_dead_inodes(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ bool need_another_pass;
+ int ret;
+again:
+ need_another_pass = false;
+
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ /*
+ * Weird transaction restart handling here because on successful delete,
+ * bch2_inode_rm_snapshot() will return a nested transaction restart,
+ * but we can't retry because the btree write buffer won't have been
+ * flushed and we'd spin:
+ */
+ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
+ if (ret < 0)
+ break;
+
+ if (ret) {
+ if (!test_bit(BCH_FS_RW, &c->flags)) {
+ bch2_trans_unlock(trans);
+ bch2_fs_lazy_rw(c);
+ }
+
+ bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
+
+ ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && need_another_pass)
+ goto again;
+err:
+ bch2_trans_put(trans);
+
+ return ret;
+}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
new file mode 100644
index 000000000000..88818a332b1e
--- /dev/null
+++ b/fs/bcachefs/inode.h
@@ -0,0 +1,217 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_H
+#define _BCACHEFS_INODE_H
+
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "opts.h"
+
+enum bkey_invalid_flags;
+extern const char * const bch2_inode_opts[];
+
+int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+
+#define bch2_bkey_ops_inode ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+ .min_val_size = 16, \
+})
+
+#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_v2_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+ .min_val_size = 32, \
+})
+
+#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_v3_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+ .min_val_size = 48, \
+})
+
+static inline bool bkey_is_inode(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inode ||
+ k->type == KEY_TYPE_inode_v2 ||
+ k->type == KEY_TYPE_inode_v3;
+}
+
+int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_generation_invalid, \
+ .val_to_text = bch2_inode_generation_to_text, \
+ .min_val_size = 8, \
+})
+
+#if 0
+typedef struct {
+ u64 lo;
+ u32 hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
+struct bch_inode_unpacked {
+ u64 bi_inum;
+ u64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ u64 bi_size;
+ u64 bi_sectors;
+ u64 bi_version;
+ u32 bi_flags;
+ u16 bi_mode;
+
+#define x(_name, _bits) u##_bits _name;
+ BCH_INODE_FIELDS_v3()
+#undef x
+};
+
+struct bkey_inode_buf {
+ struct bkey_i_inode_v3 inode;
+
+#define x(_name, _bits) + 8 + _bits / 8
+ u8 _pad[0 + BCH_INODE_FIELDS_v3()];
+#undef x
+} __packed __aligned(8);
+
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
+
+void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
+
+int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned);
+
+int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, enum btree_update_flags);
+
+static inline int bch2_inode_write(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_inode_write_flags(trans, iter, inode, 0);
+}
+
+void bch2_inode_init_early(struct bch_fs *,
+ struct bch_inode_unpacked *);
+void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
+
+int bch2_inode_create(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, u32, u64);
+
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+ subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+ struct bch_inode_unpacked *);
+
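+/*
+ * Per-inode IO options are stored biased by one: 0 means "unset, fall back to
+ * the filesystem-wide option", nonzero means the option value plus one:
+ */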
+#define inode_opt_get(_c, _inode, _name) \
+ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
+
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum inode_opt_id id, u64 v)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Inode_opt_##_name: \
+ inode->bi_##_name = v; \
+ break;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
+ enum inode_opt_id id)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Inode_opt_##_name: \
+ return inode->bi_##_name;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+static inline u8 mode_to_type(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
+static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
+{
+ return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
+}
+
+/* i_nlink: */
+
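+/*
+ * bi_nlink stores the link count minus a bias (2 for directories, which
+ * conventionally count "." and the entry in the parent; 1 otherwise); an inode
+ * with no remaining links is instead flagged BCH_INODE_unlinked, with
+ * bi_nlink == 0:
+ */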
+static inline unsigned nlink_bias(umode_t mode)
+{
+ return S_ISDIR(mode) ? 2 : 1;
+}
+
+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
+{
+ return bi->bi_flags & BCH_INODE_unlinked
+ ? 0
+ : bi->bi_nlink + nlink_bias(bi->bi_mode);
+}
+
+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
+ unsigned nlink)
+{
+ if (nlink) {
+ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
+ bi->bi_flags &= ~BCH_INODE_unlinked;
+ } else {
+ bi->bi_nlink = 0;
+ bi->bi_flags |= BCH_INODE_unlinked;
+ }
+}
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
+void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
+ struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
+
+int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
+int bch2_delete_dead_inodes(struct bch_fs *);
+
+#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
new file mode 100644
index 000000000000..bebc11444ef5
--- /dev/null
+++ b/fs/bcachefs/io_misc.c
@@ -0,0 +1,524 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * io_misc.c - fallocate, fpunch, truncate:
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "subvolume.h"
+
+/* Overwrites whatever was present with zeroes: */
+int bch2_extent_fallocate(struct btree_trans *trans,
+ subvol_inum inum,
+ struct btree_iter *iter,
+ u64 sectors,
+ struct bch_io_opts opts,
+ s64 *i_sectors_delta,
+ struct write_point_specifier write_point)
+{
+ struct bch_fs *c = trans->c;
+ struct disk_reservation disk_res = { 0 };
+ struct closure cl;
+ struct open_buckets open_buckets = { 0 };
+ struct bkey_s_c k;
+ struct bkey_buf old, new;
+ unsigned sectors_allocated = 0;
+ bool have_reservation = false;
+ bool unwritten = opts.nocow &&
+ c->sb.version >= bcachefs_metadata_version_unwritten_extents;
+ int ret;
+
+ bch2_bkey_buf_init(&old);
+ bch2_bkey_buf_init(&new);
+ closure_init_stack(&cl);
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
+
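+ /*
+ * Depending on the options and on-disk format we either insert a
+ * reservation key, or - for nocow with unwritten extent support -
+ * allocate real unwritten extents from the allocator below:
+ */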
+ if (!have_reservation) {
+ unsigned new_replicas =
+ max(0, (int) opts.data_replicas -
+ (int) bch2_bkey_nr_ptrs_fully_allocated(k));
+ /*
+ * Get a disk reservation before (in the nocow case) calling
+ * into the allocator:
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+ if (unlikely(ret))
+ goto err;
+
+ bch2_bkey_buf_reassemble(&old, c, k);
+ }
+
+ if (have_reservation) {
+ if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
+ goto err;
+
+ bch2_key_resize(&new.k->k, sectors);
+ } else if (!unwritten) {
+ struct bkey_i_reservation *reservation;
+
+ bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
+ reservation = bkey_reservation_init(new.k);
+ reservation->k.p = iter->pos;
+ bch2_key_resize(&reservation->k, sectors);
+ reservation->v.nr_replicas = opts.data_replicas;
+ } else {
+ struct bkey_i_extent *e;
+ struct bch_devs_list devs_have;
+ struct write_point *wp;
+ struct bch_extent_ptr *ptr;
+
+ devs_have.nr = 0;
+
+ bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
+
+ e = bkey_extent_init(new.k);
+ e->k.p = iter->pos;
+
+ ret = bch2_alloc_sectors_start_trans(trans,
+ opts.foreground_target,
+ false,
+ write_point,
+ &devs_have,
+ opts.data_replicas,
+ opts.data_replicas,
+ BCH_WATERMARK_normal, 0, &cl, &wp);
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ ret = -BCH_ERR_transaction_restart_nested;
+ if (ret)
+ goto err;
+
+ sectors = min_t(u64, sectors, wp->sectors_free);
+ sectors_allocated = sectors;
+
+ bch2_key_resize(&e->k, sectors);
+
+ bch2_open_bucket_get(c, wp, &open_buckets);
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+ bch2_alloc_sectors_done(c, wp);
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptr->unwritten = true;
+ }
+
+ have_reservation = true;
+
+ ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
+ 0, i_sectors_delta, true);
+err:
+ if (!ret && sectors_allocated)
+ bch2_increment_clock(c, sectors_allocated, WRITE);
+
+ bch2_open_buckets_put(c, &open_buckets);
+ bch2_disk_reservation_put(c, &disk_res);
+ bch2_bkey_buf_exit(&new, c);
+ bch2_bkey_buf_exit(&old, c);
+
+ if (closure_nr_remaining(&cl) != 1) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ }
+
+ return ret;
+}
+
+/*
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
+ */
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+ subvol_inum inum, u64 end,
+ s64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bpos end_pos = POS(inum.inum, end);
+ struct bkey_s_c k;
+ int ret = 0, ret2 = 0;
+ u32 snapshot;
+
+ while (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ struct bkey_i delete;
+
+ if (ret)
+ ret2 = ret;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+
+ /*
+ * peek_upto() doesn't have ideal semantics for extents:
+ */
+ k = bch2_btree_iter_peek_upto(iter, end_pos);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k);
+ if (ret)
+ continue;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter->pos;
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+ bch2_cut_back(end_pos, &delete);
+
+ ret = bch2_extent_update(trans, inum, iter, &delete,
+ &disk_res, 0, i_sectors_delta, false);
+ bch2_disk_reservation_put(c, &disk_res);
+ }
+
+ return ret ?: ret2;
+}
+
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+ s64 *i_sectors_delta)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, start),
+ BTREE_ITER_INTENT);
+
+ ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+
+ return ret;
+}
+
+/* truncate: */
+
+void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
+
+ prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+ prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+ prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
+}
+
+static int truncate_set_isize(struct btree_trans *trans,
+ subvol_inum inum,
+ u64 new_i_size)
+{
+ struct btree_iter iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+ (inode_u.bi_size = new_i_size, 0) ?:
+ bch2_inode_write(trans, &iter, &inode_u);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
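+/*
+ * Truncate is implemented as a resumable logged operation: the new i_size is
+ * committed along with the logged op key, then the tail is fpunched; if we
+ * crash partway through, resuming the logged op redoes the work:
+ */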
+static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
+ struct bkey_i *op_k,
+ u64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter fpunch_iter;
+ struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
+ subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+ int ret;
+
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ truncate_set_isize(trans, inum, new_i_size));
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
+ POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
+ BTREE_ITER_INTENT);
+ ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
+ bch2_trans_iter_exit(trans, &fpunch_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+err:
+ bch2_logged_op_finish(trans, op_k);
+ return ret;
+}
+
+int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
+{
+ return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
+}
+
+int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
+{
+ struct bkey_i_logged_op_truncate op;
+
+ bkey_logged_op_truncate_init(&op.k_i);
+ op.v.subvol = cpu_to_le32(inum.subvol);
+ op.v.inum = cpu_to_le64(inum.inum);
+ op.v.new_i_size = cpu_to_le64(new_i_size);
+
+ /*
+ * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+ * snapshot while they're in progress, then crashing, will result in the
+ * resume only proceeding in one of the snapshots
+ */
+ down_read(&c->snapshot_create_lock);
+ int ret = bch2_trans_run(c,
+ bch2_logged_op_start(trans, &op.k_i) ?:
+ __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+ up_read(&c->snapshot_create_lock);
+
+ return ret;
+}
+
+/* finsert/fcollapse: */
+
+void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
+
+ prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+ prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+ prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset));
+ prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
+}
+
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
+{
+ struct btree_iter iter;
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ offset <<= 9;
+ len <<= 9;
+
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+ if (ret)
+ return ret;
+
+ if (len > 0) {
+ if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
+ ret = -EFBIG;
+ goto err;
+ }
+
+ if (offset >= inode_u.bi_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ inode_u.bi_size += len;
+ inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
+
+ ret = bch2_inode_write(trans, &iter, &inode_u);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
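+/*
+ * finsert/fcollapse are likewise resumable logged operations: progress is
+ * persisted in the logged op key (a small state machine plus the current
+ * position), so after a crash the extent shift continues where it left off:
+ */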
+static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
+ struct bkey_i *op_k,
+ u64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
+ subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ struct bch_io_opts opts;
+ u64 dst_offset = le64_to_cpu(op->v.dst_offset);
+ u64 src_offset = le64_to_cpu(op->v.src_offset);
+ s64 shift = dst_offset - src_offset;
+ u64 len = abs(shift);
+ u64 pos = le64_to_cpu(op->v.pos);
+ bool insert = shift > 0;
+ int ret = 0;
+
+ ret = bch2_inum_opts_get(trans, inum, &opts);
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, 0),
+ BTREE_ITER_INTENT);
+
+ switch (op->v.state) {
+case LOGGED_OP_FINSERT_start:
+ op->v.state = LOGGED_OP_FINSERT_shift_extents;
+
+ if (insert) {
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ adjust_i_size(trans, inum, src_offset, len) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ if (ret)
+ goto err;
+ } else {
+ bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+
+ ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto err;
+
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_logged_op_update(trans, &op->k_i));
+ }
+
+ fallthrough;
+case LOGGED_OP_FINSERT_shift_extents:
+ while (1) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ struct bkey_i delete, *copy;
+ struct bkey_s_c k;
+ struct bpos src_pos = POS(inum.inum, src_offset);
+ u32 snapshot;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto btree_err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+
+ k = insert
+ ? bch2_btree_iter_peek_prev(&iter)
+ : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto btree_err;
+
+ if (!k.k ||
+ k.k->p.inode != inum.inum ||
+ bkey_le(k.k->p, POS(inum.inum, src_offset)))
+ break;
+
+ copy = bch2_bkey_make_mut_noupdate(trans, k);
+ if ((ret = PTR_ERR_OR_ZERO(copy)))
+ goto btree_err;
+
+ if (insert &&
+ bkey_lt(bkey_start_pos(k.k), src_pos)) {
+ bch2_cut_front(src_pos, copy);
+
+ /* Splitting compressed extent? */
+ bch2_disk_reservation_add(c, &disk_res,
+ copy->k.size *
+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
+ BCH_DISK_RESERVATION_NOFAIL);
+ }
+
+ bkey_init(&delete.k);
+ delete.k.p = copy->k.p;
+ delete.k.p.snapshot = snapshot;
+ delete.k.size = copy->k.size;
+
+ copy->k.p.offset += shift;
+ copy->k.p.snapshot = snapshot;
+
+ op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
+
+ ret = bch2_bkey_set_needs_rebalance(c, copy,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
+ bch2_logged_op_update(trans, &op->k_i) ?:
+ bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+btree_err:
+ bch2_disk_reservation_put(c, &disk_res);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ pos = le64_to_cpu(op->v.pos);
+ }
+
+ op->v.state = LOGGED_OP_FINSERT_finish;
+
+ if (!insert) {
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ adjust_i_size(trans, inum, src_offset, shift) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ } else {
+ /* We need an inode update to update bi_journal_seq for fsync: */
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ adjust_i_size(trans, inum, 0, 0) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ }
+
+ break;
+case LOGGED_OP_FINSERT_finish:
+ break;
+ }
+err:
+ bch2_logged_op_finish(trans, op_k);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
+{
+ return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
+}
+
+int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 len, bool insert,
+ s64 *i_sectors_delta)
+{
+ struct bkey_i_logged_op_finsert op;
+ s64 shift = insert ? len : -len;
+
+ bkey_logged_op_finsert_init(&op.k_i);
+ op.v.subvol = cpu_to_le32(inum.subvol);
+ op.v.inum = cpu_to_le64(inum.inum);
+ op.v.dst_offset = cpu_to_le64(offset + shift);
+ op.v.src_offset = cpu_to_le64(offset);
+ op.v.pos = cpu_to_le64(insert ? U64_MAX : offset);
+
+ /*
+ * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+ * snapshot while they're in progress, then crashing, will result in the
+ * resume only proceeding in one of the snapshots
+ */
+ down_read(&c->snapshot_create_lock);
+ int ret = bch2_trans_run(c,
+ bch2_logged_op_start(trans, &op.k_i) ?:
+ __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+ up_read(&c->snapshot_create_lock);
+
+ return ret;
+}
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
new file mode 100644
index 000000000000..9cb44a7c43c1
--- /dev/null
+++ b/fs/bcachefs/io_misc.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_MISC_H
+#define _BCACHEFS_IO_MISC_H
+
+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
+ u64, struct bch_io_opts, s64 *,
+ struct write_point_specifier);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+ subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
+
+void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \
+ .val_to_text = bch2_logged_op_truncate_to_text, \
+ .min_val_size = 24, \
+})
+
+int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
+
+int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
+
+void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \
+ .val_to_text = bch2_logged_op_finsert_to_text, \
+ .min_val_size = 24, \
+})
+
+int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
+
+int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
+
+#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
new file mode 100644
index 000000000000..36763865facd
--- /dev/null
+++ b/fs/bcachefs/io_read.c
@@ -0,0 +1,1210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "data_update.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ const struct bch_devs_mask *devs;
+ unsigned d, nr = 0, total = 0;
+ u64 now = local_clock(), last;
+ s64 congested;
+ struct bch_dev *ca;
+
+ if (!target)
+ return false;
+
+ rcu_read_lock();
+ devs = bch2_target_to_mask(c, target) ?:
+ &c->rw_devs[BCH_DATA_user];
+
+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+ ca = rcu_dereference(c->devs[d]);
+ if (!ca)
+ continue;
+
+ congested = atomic_read(&ca->congested);
+ last = READ_ONCE(ca->congested_last);
+ if (time_after64(now, last))
+ congested -= (now - last) >> 12;
+
+ total += max(congested, 0LL);
+ nr++;
+ }
+ rcu_read_unlock();
+
+ return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ return false;
+}
+
+#endif
+
+/* Cache promotion on read */
+
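+/*
+ * If a read's extent isn't already on the promote target (and the read allows
+ * promotion), we bounce the data and kick off a cached data_update in the
+ * background to copy the extent to the promote target:
+ */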
+struct promote_op {
+ struct rcu_head rcu;
+ u64 start_time;
+
+ struct rhash_head hash;
+ struct bpos pos;
+
+ struct data_update write;
+ struct bio_vec bi_inline_vecs[0]; /* must be last */
+};
+
+static const struct rhashtable_params bch_promote_params = {
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+};
+
+static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
+ struct bpos pos,
+ struct bch_io_opts opts,
+ unsigned flags)
+{
+ BUG_ON(!opts.promote_target);
+
+ if (!(flags & BCH_READ_MAY_PROMOTE))
+ return -BCH_ERR_nopromote_may_not;
+
+ if (bch2_bkey_has_target(c, k, opts.promote_target))
+ return -BCH_ERR_nopromote_already_promoted;
+
+ if (bkey_extent_is_unwritten(k))
+ return -BCH_ERR_nopromote_unwritten;
+
+ if (bch2_target_congested(c, opts.promote_target))
+ return -BCH_ERR_nopromote_congested;
+
+ if (rhashtable_lookup_fast(&c->promote_table, &pos,
+ bch_promote_params))
+ return -BCH_ERR_nopromote_in_flight;
+
+ return 0;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+ int ret;
+
+ bch2_data_update_exit(&op->write);
+
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ kfree_rcu(op, rcu);
+}
+
+static void promote_done(struct bch_write_op *wop)
+{
+ struct promote_op *op =
+ container_of(wop, struct promote_op, write.op);
+ struct bch_fs *c = op->write.op.c;
+
+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+ op->start_time);
+ promote_free(c, op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+ struct bio *bio = &op->write.op.wbio.bio;
+
+ trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+
+ /* we now own pages: */
+ BUG_ON(!rbio->bounce);
+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+ swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+
+ bch2_data_update_read_done(&op->write, rbio->pick.crc);
+}
+
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bkey_s_c k,
+ struct bpos pos,
+ struct extent_ptr_decoded *pick,
+ struct bch_io_opts opts,
+ unsigned sectors,
+ struct bch_read_bio **rbio)
+{
+ struct bch_fs *c = trans->c;
+ struct promote_op *op = NULL;
+ struct bio *bio;
+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ int ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+ return NULL;
+
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
+ if (!op)
+ goto err;
+
+ op->start_time = local_clock();
+ op->pos = pos;
+
+ /*
+ * We don't use the mempool here because extents that aren't
+ * checksummed or compressed can be too big for the mempool:
+ */
+ *rbio = kzalloc(sizeof(struct bch_read_bio) +
+ sizeof(struct bio_vec) * pages,
+ GFP_NOFS);
+ if (!*rbio)
+ goto err;
+
+ rbio_init(&(*rbio)->bio, opts);
+ bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
+ GFP_NOFS))
+ goto err;
+
+ (*rbio)->bounce = true;
+ (*rbio)->split = true;
+ (*rbio)->kmalloc = true;
+
+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+ bch_promote_params))
+ goto err;
+
+ bio = &op->write.op.wbio.bio;
+ bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+
+ ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
+ writepoint_hashed((unsigned long) current),
+ opts,
+ (struct data_update_opts) {
+ .target = opts.promote_target,
+ .extra_replicas = 1,
+ .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
+ },
+ btree_id, k);
+ /*
+ * possible errors: -BCH_ERR_nocow_lock_blocked,
+ * -BCH_ERR_ENOSPC_disk_reservation:
+ */
+ if (ret) {
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ goto err;
+ }
+
+ op->write.op.end_io = promote_done;
+
+ return op;
+err:
+ if (*rbio)
+ bio_free_pages(&(*rbio)->bio);
+ kfree(*rbio);
+ *rbio = NULL;
+ kfree(op);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ return NULL;
+}
+
+noinline
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+ struct bvec_iter iter,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded *pick,
+ struct bch_io_opts opts,
+ unsigned flags,
+ struct bch_read_bio **rbio,
+ bool *bounce,
+ bool *read_full)
+{
+ struct bch_fs *c = trans->c;
+ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+ /* data might have to be decompressed in the write path: */
+ unsigned sectors = promote_full
+ ? max(pick->crc.compressed_size, pick->crc.live_size)
+ : bvec_iter_sectors(iter);
+ struct bpos pos = promote_full
+ ? bkey_start_pos(k.k)
+ : POS(k.k->p.inode, iter.bi_sector);
+ struct promote_op *promote;
+ int ret;
+
+ ret = should_promote(c, k, pos, opts, flags);
+ if (ret)
+ goto nopromote;
+
+ promote = __promote_alloc(trans,
+ k.k->type == KEY_TYPE_reflink_v
+ ? BTREE_ID_reflink
+ : BTREE_ID_extents,
+ k, pos, pick, opts, sectors, rbio);
+ if (!promote) {
+ ret = -BCH_ERR_nopromote_enomem;
+ goto nopromote;
+ }
+
+ *bounce = true;
+ *read_full = promote_full;
+ return promote;
+nopromote:
+ trace_read_nopromote(c, ret);
+ return NULL;
+}
+
+/* Read */
+
+#define READ_RETRY_AVOID 1
+#define READ_RETRY 2
+#define READ_ERR 3
+
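+/*
+ * Read completion work runs in the cheapest context it can: checksum-only
+ * completions are punted to the highpri workqueue, anything that needs
+ * decompression, decryption, crc narrowing or promotion goes to the unbound
+ * workqueue, and everything else completes inline in the endio path (see
+ * bch2_read_endio()):
+ */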
+enum rbio_context {
+ RBIO_CONTEXT_NULL,
+ RBIO_CONTEXT_HIGHPRI,
+ RBIO_CONTEXT_UNBOUND,
+};
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+ return rbio->split ? rbio->parent : rbio;
+}
+
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+ enum rbio_context context,
+ struct workqueue_struct *wq)
+{
+ if (context <= rbio->context) {
+ fn(&rbio->work);
+ } else {
+ rbio->work.func = fn;
+ rbio->context = context;
+ queue_work(wq, &rbio->work);
+ }
+}
+
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
+{
+ BUG_ON(rbio->bounce && !rbio->split);
+
+ if (rbio->promote)
+ promote_free(rbio->c, rbio->promote);
+ rbio->promote = NULL;
+
+ if (rbio->bounce)
+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+ if (rbio->split) {
+ struct bch_read_bio *parent = rbio->parent;
+
+ if (rbio->kmalloc)
+ kfree(rbio);
+ else
+ bio_put(&rbio->bio);
+
+ rbio = parent;
+ }
+
+ return rbio;
+}
+
+/*
+ * Only called on a top level bch_read_bio to complete an entire read request,
+ * not a split:
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+ if (rbio->start_time)
+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+ rbio->start_time);
+ bio_endio(&rbio->bio);
+}
+
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter,
+ struct bch_io_failures *failed,
+ unsigned flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bkey_s_c k;
+ int ret;
+
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+
+ bch2_bkey_buf_init(&sk);
+
+ bch2_trans_iter_init(trans, &iter, rbio->data_btree,
+ rbio->read_pos, BTREE_ITER_SLOTS);
+retry:
+ rbio->bio.bi_status = 0;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ if (bkey_err(k))
+ goto err;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+ bch2_trans_unlock(trans);
+
+ if (!bch2_bkey_matches_ptr(c, k,
+ rbio->pick.ptr,
+ rbio->data_pos.offset -
+ rbio->pick.crc.offset)) {
+ /* extent we wanted to read no longer exists: */
+ rbio->hole = true;
+ goto out;
+ }
+
+ ret = __bch2_read_extent(trans, rbio, bvec_iter,
+ rbio->read_pos,
+ rbio->data_btree,
+ k, 0, failed, flags);
+ if (ret == READ_RETRY)
+ goto retry;
+ if (ret)
+ goto err;
+out:
+ bch2_rbio_done(rbio);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+ return;
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ goto out;
+}
+
+static void bch2_rbio_retry(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bvec_iter iter = rbio->bvec_iter;
+ unsigned flags = rbio->flags;
+ subvol_inum inum = {
+ .subvol = rbio->subvol,
+ .inum = rbio->read_pos.inode,
+ };
+ struct bch_io_failures failed = { .nr = 0 };
+
+ trace_and_count(c, read_retry, &rbio->bio);
+
+ if (rbio->retry == READ_RETRY_AVOID)
+ bch2_mark_io_failure(&failed, &rbio->pick);
+
+ rbio->bio.bi_status = 0;
+
+ rbio = bch2_rbio_free(rbio);
+
+ flags |= BCH_READ_IN_RETRY;
+ flags &= ~BCH_READ_MAY_PROMOTE;
+
+ if (flags & BCH_READ_NODECODE) {
+ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+ } else {
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+
+ __bch2_read(c, rbio, iter, inum, &failed, flags);
+ }
+}
+
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+ blk_status_t error)
+{
+ rbio->retry = retry;
+
+ if (rbio->flags & BCH_READ_IN_RETRY)
+ return;
+
+ if (retry == READ_ERR) {
+ rbio = bch2_rbio_free(rbio);
+
+ rbio->bio.bi_status = error;
+ bch2_rbio_done(rbio);
+ } else {
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ }
+}
+
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
+ struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = rbio->c;
+ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
+ struct bch_extent_crc_unpacked new_crc;
+ struct btree_iter iter;
+ struct bkey_i *new;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (crc_is_compressed(rbio->pick.crc))
+ return 0;
+
+ k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ if ((ret = bkey_err(k)))
+ goto out;
+
+ if (bversion_cmp(k.k->version, rbio->version) ||
+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
+ goto out;
+
+ /* Extent was merged? */
+ if (bkey_start_offset(k.k) < data_offset ||
+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+ goto out;
+
+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+ rbio->pick.crc, NULL, &new_crc,
+ bkey_start_offset(k.k) - data_offset, k.k->size,
+ rbio->pick.crc.csum_type)) {
+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * going to be temporarily appending another checksum entry:
+ */
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+ sizeof(struct bch_extent_crc128));
+ if ((ret = PTR_ERR_OR_ZERO(new)))
+ goto out;
+
+ bkey_reassemble(new, k);
+
+ if (!bch2_bkey_narrow_crcs(new, new_crc))
+ goto out;
+
+ ret = bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+/* Inner part that may run in process context */
+static void __bch2_read_endio(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &bch2_rbio_parent(rbio)->bio;
+ struct bvec_iter dst_iter = rbio->bvec_iter;
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+ unsigned nofs_flags;
+ struct bch_csum csum;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+
+ /* Reset iterator for checksumming and copying bounced data: */
+ if (rbio->bounce) {
+ src->bi_iter.bi_size = crc.compressed_size << 9;
+ src->bi_iter.bi_idx = 0;
+ src->bi_iter.bi_bvec_done = 0;
+ } else {
+ src->bi_iter = rbio->bvec_iter;
+ }
+
+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+ goto csum_err;
+
+ /*
+ * XXX
+ * We need to rework the narrow_crcs path to deliver the read completion
+ * first, and then punt to a different workqueue, otherwise we're
+ * holding up reads while doing btree updates which is bad for memory
+ * reclaim.
+ */
+ if (unlikely(rbio->narrow_crcs))
+ bch2_rbio_narrow_crcs(rbio);
+
+ if (rbio->flags & BCH_READ_NODECODE)
+ goto nodecode;
+
+ /* Adjust crc to point to subset of data we want: */
+ crc.offset += rbio->offset_into_extent;
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
+
+ if (crc_is_compressed(crc)) {
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+ !c->opts.no_data_io)
+ goto decompression_err;
+ } else {
+ /* don't need to decrypt the entire bio: */
+ nonce = nonce_add(nonce, crc.offset << 9);
+ bio_advance(src, crc.offset << 9);
+
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+ src->bi_iter.bi_size = dst_iter.bi_size;
+
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ if (rbio->bounce) {
+ struct bvec_iter src_iter = src->bi_iter;
+
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+ }
+ }
+
+ if (rbio->promote) {
+ /*
+ * Re-encrypt data we decrypted, so it's consistent with
+ * rbio->crc:
+ */
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ promote_start(rbio->promote, rbio);
+ rbio->promote = NULL;
+ }
+nodecode:
+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+ rbio = bch2_rbio_free(rbio);
+ bch2_rbio_done(rbio);
+ }
+out:
+ memalloc_nofs_restore(nofs_flags);
+ return;
+csum_err:
+ /*
+ * Checksum error: if the bio wasn't bounced, we may have been
+ * reading into buffers owned by userspace (that userspace can
+ * scribble over) - retry the read, bouncing it this time:
+ */
+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+ rbio->flags |= BCH_READ_MUST_BOUNCE;
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+ goto out;
+ }
+
+ bch_err_inum_offset_ratelimited(ca,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+ csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ goto out;
+decompression_err:
+ bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "decompression error");
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ goto out;
+decrypt_err:
+ bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "decrypt error");
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ goto out;
+}
+
+static void bch2_read_endio(struct bio *bio)
+{
+ struct bch_read_bio *rbio =
+ container_of(bio, struct bch_read_bio, bio);
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct workqueue_struct *wq = NULL;
+ enum rbio_context context = RBIO_CONTEXT_NULL;
+
+ if (rbio->have_ioref) {
+ bch2_latency_acct(ca, rbio->submit_time, READ);
+ percpu_ref_put(&ca->io_ref);
+ }
+
+ if (!rbio->split)
+ rbio->bio.bi_end_io = rbio->end_io;
+
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset,
+ "data read error: %s",
+ bch2_blk_status_to_str(bio->bi_status))) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+ return;
+ }
+
+ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ ptr_stale(ca, &rbio->pick.ptr)) {
+ trace_and_count(c, read_reuse_race, &rbio->bio);
+
+ if (rbio->flags & BCH_READ_RETRY_IF_STALE)
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+ else
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+ return;
+ }
+
+ if (rbio->narrow_crcs ||
+ rbio->promote ||
+ crc_is_compressed(rbio->pick.crc) ||
+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
+ else if (rbio->pick.crc.csum_type)
+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
+
+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+
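+/*
+ * Resolve a reflink pointer: look up the indirect extent it points to in the
+ * reflink btree and substitute it for the original key, adjusting
+ * offset_into_extent:
+ */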
+int __bch2_read_indirect_extent(struct btree_trans *trans,
+ unsigned *offset_into_extent,
+ struct bkey_buf *orig_k)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 reflink_offset;
+ int ret;
+
+ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
+ *offset_into_extent;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+ POS(0, reflink_offset), 0);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_reflink_v &&
+ k.k->type != KEY_TYPE_indirect_inline_data) {
+ bch_err_inum_offset_ratelimited(trans->c,
+ orig_k->k->k.p.inode,
+ orig_k->k->k.p.offset << 9,
+ "%llu len %u points to nonexistent indirect extent %llu",
+ orig_k->k->k.p.offset,
+ orig_k->k->k.size,
+ reflink_offset);
+ bch2_inconsistent_error(trans->c);
+ ret = -EIO;
+ goto err;
+ }
+
+ *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+ bch2_bkey_buf_reassemble(orig_k, trans->c, k);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bch_extent_ptr ptr)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+ struct btree_iter iter;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ PTR_BUCKET_POS(c, &ptr),
+ BTREE_ITER_CACHED);
+
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+ printbuf_indent_add(&buf, 2);
+ prt_newline(&buf);
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+}
+
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
+ struct bvec_iter iter, struct bpos read_pos,
+ enum btree_id data_btree, struct bkey_s_c k,
+ unsigned offset_into_extent,
+ struct bch_io_failures *failed, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct extent_ptr_decoded pick;
+ struct bch_read_bio *rbio = NULL;
+ struct bch_dev *ca = NULL;
+ struct promote_op *promote = NULL;
+ bool bounce = false, read_full = false, narrow_crcs = false;
+ struct bpos data_pos = bkey_start_pos(k.k);
+ int pick_ret;
+
+ if (bkey_extent_is_inline_data(k.k)) {
+ unsigned bytes = min_t(unsigned, iter.bi_size,
+ bkey_inline_data_bytes(k.k));
+
+ swap(iter.bi_size, bytes);
+ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
+ swap(iter.bi_size, bytes);
+ bio_advance_iter(&orig->bio, &iter, bytes);
+ zero_fill_bio_iter(&orig->bio, iter);
+ goto out_read_done;
+ }
+retry_pick:
+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+
+ /* hole or reservation - just zero fill: */
+ if (!pick_ret)
+ goto hole;
+
+ if (pick_ret < 0) {
+ bch_err_inum_offset_ratelimited(c,
+ read_pos.inode, read_pos.offset << 9,
+ "no device to read from");
+ goto err;
+ }
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+ /*
+ * Stale dirty pointers are treated as IO errors, but @failed isn't
+ * allocated unless we're in the retry path - so if we're not in the
+ * retry path, don't check here; it'll be caught in bch2_read_endio()
+ * and we'll end up in the retry path:
+ */
+ if ((flags & BCH_READ_IN_RETRY) &&
+ !pick.ptr.cached &&
+ unlikely(ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ bch2_mark_io_failure(failed, &pick);
+ goto retry_pick;
+ }
+
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ bch2_trans_unlock(trans);
+
+ if (flags & BCH_READ_NODECODE) {
+ /*
+ * can happen if we retry, and the extent we were going to read
+ * has been merged in the meantime:
+ */
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ goto hole;
+
+ iter.bi_size = pick.crc.compressed_size << 9;
+ goto get_bio;
+ }
+
+ if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+ bio_flagged(&orig->bio, BIO_CHAIN))
+ flags |= BCH_READ_MUST_CLONE;
+
+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+ bch2_can_narrow_extent_crcs(k, pick.crc);
+
+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+ flags |= BCH_READ_MUST_BOUNCE;
+
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+ if (crc_is_compressed(pick.crc) ||
+ (pick.crc.csum_type != BCH_CSUM_none &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+ (flags & BCH_READ_USER_MAPPED)) ||
+ (flags & BCH_READ_MUST_BOUNCE)))) {
+ read_full = true;
+ bounce = true;
+ }
+
+ if (orig->opts.promote_target)
+ promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
+ &rbio, &bounce, &read_full);
+
+ if (!read_full) {
+ EBUG_ON(crc_is_compressed(pick.crc));
+ EBUG_ON(pick.crc.csum_type &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ bvec_iter_sectors(iter) != pick.crc.live_size ||
+ pick.crc.offset ||
+ offset_into_extent));
+
+ data_pos.offset += offset_into_extent;
+ pick.ptr.offset += pick.crc.offset +
+ offset_into_extent;
+ offset_into_extent = 0;
+ pick.crc.compressed_size = bvec_iter_sectors(iter);
+ pick.crc.uncompressed_size = bvec_iter_sectors(iter);
+ pick.crc.offset = 0;
+ pick.crc.live_size = bvec_iter_sectors(iter);
+ }
+get_bio:
+ if (rbio) {
+ /*
+ * promote already allocated bounce rbio:
+ * promote needs to allocate a bio big enough for uncompressing
+ * data in the write path, but we're not going to use it all
+ * here:
+ */
+ EBUG_ON(rbio->bio.bi_iter.bi_size <
+ pick.crc.compressed_size << 9);
+ rbio->bio.bi_iter.bi_size =
+ pick.crc.compressed_size << 9;
+ } else if (bounce) {
+ unsigned sectors = pick.crc.compressed_size;
+
+ rbio = rbio_init(bio_alloc_bioset(NULL,
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ 0,
+ GFP_NOFS,
+ &c->bio_read_split),
+ orig->opts);
+
+ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+ rbio->bounce = true;
+ rbio->split = true;
+ } else if (flags & BCH_READ_MUST_CLONE) {
+ /*
+ * Have to clone if there were any splits, due to error
+ * reporting issues (if a split errored, and retrying didn't
+ * work, when it reports the error to its parent (us) we don't
+ * know if the error was from our bio, and we should retry, or
+ * from the whole bio, in which case we don't want to retry and
+ * lose the error)
+ */
+ rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+ &c->bio_read_split),
+ orig->opts);
+ rbio->bio.bi_iter = iter;
+ rbio->split = true;
+ } else {
+ rbio = orig;
+ rbio->bio.bi_iter = iter;
+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+ }
+
+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+ rbio->c = c;
+ rbio->submit_time = local_clock();
+ if (rbio->split)
+ rbio->parent = orig;
+ else
+ rbio->end_io = orig->bio.bi_end_io;
+ rbio->bvec_iter = iter;
+ rbio->offset_into_extent = offset_into_extent;
+ rbio->flags = flags;
+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+ rbio->narrow_crcs = narrow_crcs;
+ rbio->hole = 0;
+ rbio->retry = 0;
+ rbio->context = 0;
+ /* XXX: only initialize this if needed */
+ rbio->devs_have = bch2_bkey_devs(k);
+ rbio->pick = pick;
+ rbio->subvol = orig->subvol;
+ rbio->read_pos = read_pos;
+ rbio->data_btree = data_btree;
+ rbio->data_pos = data_pos;
+ rbio->version = k.k->version;
+ rbio->promote = promote;
+ INIT_WORK(&rbio->work, NULL);
+
+ rbio->bio.bi_opf = orig->bio.bi_opf;
+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+ rbio->bio.bi_end_io = bch2_read_endio;
+
+ if (rbio->bounce)
+ trace_and_count(c, read_bounce, &rbio->bio);
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+ /*
+ * If it's being moved internally, we don't want to flag it as a cache
+ * hit:
+ */
+ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+ PTR_BUCKET_NR(ca, &pick.ptr), READ);
+
+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+ bio_inc_remaining(&orig->bio);
+ trace_and_count(c, read_split, &orig->bio);
+ }
+
+ if (!rbio->pick.idx) {
+ if (!rbio->have_ioref) {
+ bch_err_inum_offset_ratelimited(c,
+ read_pos.inode,
+ read_pos.offset << 9,
+ "no device to read from");
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ goto out;
+ }
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
+ bio_sectors(&rbio->bio));
+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+ if (unlikely(c->opts.no_data_io)) {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ bio_endio(&rbio->bio);
+ } else {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ submit_bio(&rbio->bio);
+ else
+ submit_bio_wait(&rbio->bio);
+ }
+
+ /*
+ * We just submitted IO which may block; we expect relock fail
+ * events and shouldn't count them:
+ */
+ trans->notrace_relock_fail = true;
+ } else {
+ /* Attempting reconstruct read: */
+ if (bch2_ec_read_extent(trans, rbio)) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ goto out;
+ }
+
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ bio_endio(&rbio->bio);
+ }
+out:
+ if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ return 0;
+ } else {
+ int ret;
+
+ rbio->context = RBIO_CONTEXT_UNBOUND;
+ bch2_read_endio(&rbio->bio);
+
+ ret = rbio->retry;
+ rbio = bch2_rbio_free(rbio);
+
+ if (ret == READ_RETRY_AVOID) {
+ bch2_mark_io_failure(failed, &pick);
+ ret = READ_RETRY;
+ }
+
+ if (!ret)
+ goto out_read_done;
+
+ return ret;
+ }
+
+err:
+ if (flags & BCH_READ_IN_RETRY)
+ return READ_ERR;
+
+ orig->bio.bi_status = BLK_STS_IOERR;
+ goto out_read_done;
+
+hole:
+ /*
+ * won't normally happen in the BCH_READ_NODECODE
+ * (bch2_move_extent()) path, but if we retry and the extent we wanted
+ * to read no longer exists we have to signal that:
+ */
+ if (flags & BCH_READ_NODECODE)
+ orig->hole = true;
+
+ zero_fill_bio_iter(&orig->bio, iter);
+out_read_done:
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bch2_rbio_done(orig);
+ return 0;
+}
+
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, subvol_inum inum,
+ struct bch_io_failures *failed, unsigned flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ BUG_ON(flags & BCH_READ_NODECODE);
+
+ bch2_bkey_buf_init(&sk);
+retry:
+ bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS);
+ while (1) {
+ unsigned bytes, sectors, offset_into_extent;
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ /*
+ * read_extent -> io_time_reset may cause a transaction restart
+ * without returning an error, so we need to check for that here:
+ */
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, bvec_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ offset_into_extent = iter.pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &sk);
+ if (ret)
+ break;
+
+ k = bkey_i_to_s_c(sk.k);
+
+ /*
+ * With indirect extents, the amount of data to read is the min
+ * of the original extent and the indirect extent:
+ */
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+ swap(bvec_iter.bi_size, bytes);
+
+ if (bvec_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
+
+ ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
+ data_btree, k,
+ offset_into_extent, failed, flags);
+ if (ret)
+ break;
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ break;
+
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ ret == READ_RETRY ||
+ ret == READ_RETRY_AVOID)
+ goto retry;
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+
+ if (ret) {
+ bch_err_inum_offset_ratelimited(c, inum.inum,
+ bvec_iter.bi_sector << 9,
+ "read error %i from btree lookup", ret);
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ bch2_rbio_done(rbio);
+ }
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *c)
+{
+ if (c->promote_table.tbl)
+ rhashtable_destroy(&c->promote_table);
+ bioset_exit(&c->bio_read_split);
+ bioset_exit(&c->bio_read);
+}
+
+int bch2_fs_io_read_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_init;
+
+ if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+ if (rhashtable_init(&c->promote_table, &bch_promote_params))
+ return -BCH_ERR_ENOMEM_promote_table_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
new file mode 100644
index 000000000000..d9c18bb7d403
--- /dev/null
+++ b/fs/bcachefs/io_read.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_READ_H
+#define _BCACHEFS_IO_READ_H
+
+#include "bkey_buf.h"
+
+struct bch_read_bio {
+ struct bch_fs *c;
+ u64 start_time;
+ u64 submit_time;
+
+ /*
+ * Reads will often have to be split, and if the extent being read from
+ * was checksummed or compressed we'll also have to allocate bounce
+ * buffers and copy the data back into the original bio.
+ *
+ * If we didn't have to split, we have to save and restore the original
+ * bi_end_io - @split below indicates which:
+ */
+ union {
+ struct bch_read_bio *parent;
+ bio_end_io_t *end_io;
+ };
+
+ /*
+ * Saved copy of bio->bi_iter, from submission time - allows us to
+ * resubmit on IO error, and also to copy data back to the original bio
+ * when we're bouncing:
+ */
+ struct bvec_iter bvec_iter;
+
+ unsigned offset_into_extent;
+
+ u16 flags;
+ union {
+ struct {
+ u16 bounce:1,
+ split:1,
+ kmalloc:1,
+ have_ioref:1,
+ narrow_crcs:1,
+ hole:1,
+ retry:2,
+ context:2;
+ };
+ u16 _state;
+ };
+
+ struct bch_devs_list devs_have;
+
+ struct extent_ptr_decoded pick;
+
+ /*
+ * pos we read from - different from data_pos for indirect extents:
+ */
+ u32 subvol;
+ struct bpos read_pos;
+
+ /*
+ * start pos of data we read (may not be pos of data we want) - for
+ * promote, narrow extents paths:
+ */
+ enum btree_id data_btree;
+ struct bpos data_pos;
+ struct bversion version;
+
+ struct promote_op *promote;
+
+ struct bch_io_opts opts;
+
+ struct work_struct work;
+
+ struct bio bio;
+};
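+
+/*
+ * The bitfields above share storage with _state so that rbio_init() can
+ * clear every per-read flag with a single store to _state. The same
+ * pattern in miniature - field names here are illustrative only:
+ */
+struct rbio_state_example {
+	union {
+	struct {
+		u16		flag_a:1,
+				flag_b:1,
+				two_bit_counter:2;
+	};
+	u16			_state;
+	};
+};
+
+static inline void rbio_state_example_reset(struct rbio_state_example *s)
+{
+	/* zeroes flag_a, flag_b and two_bit_counter in one store */
+	s->_state = 0;
+}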
+
+#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio)
+
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_ptr_decoded;
+
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
+ struct bkey_buf *);
+
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+ enum btree_id *data_btree,
+ unsigned *offset_into_extent,
+ struct bkey_buf *k)
+{
+ if (k->k->k.type != KEY_TYPE_reflink_p)
+ return 0;
+
+ *data_btree = BTREE_ID_reflink;
+ return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+}
+
+enum bch_read_flags {
+ BCH_READ_RETRY_IF_STALE = 1 << 0,
+ BCH_READ_MAY_PROMOTE = 1 << 1,
+ BCH_READ_USER_MAPPED = 1 << 2,
+ BCH_READ_NODECODE = 1 << 3,
+ BCH_READ_LAST_FRAGMENT = 1 << 4,
+
+ /* internal: */
+ BCH_READ_MUST_BOUNCE = 1 << 5,
+ BCH_READ_MUST_CLONE = 1 << 6,
+ BCH_READ_IN_RETRY = 1 << 7,
+};
+
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
+ struct bvec_iter, struct bpos, enum btree_id,
+ struct bkey_s_c, unsigned,
+ struct bch_io_failures *, unsigned);
+
+static inline void bch2_read_extent(struct btree_trans *trans,
+ struct bch_read_bio *rbio, struct bpos read_pos,
+ enum btree_id data_btree, struct bkey_s_c k,
+ unsigned offset_into_extent, unsigned flags)
+{
+ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+ data_btree, k, offset_into_extent, NULL, flags);
+}
+
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+ subvol_inum, struct bch_io_failures *, unsigned flags);
+
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ subvol_inum inum)
+{
+ struct bch_io_failures failed = { .nr = 0 };
+
+ BUG_ON(rbio->_state);
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
+
+ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE|
+ BCH_READ_USER_MAPPED);
+}
+
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+ struct bch_io_opts opts)
+{
+ struct bch_read_bio *rbio = to_rbio(bio);
+
+ rbio->_state = 0;
+ rbio->promote = NULL;
+ rbio->opts = opts;
+ return rbio;
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *);
+int bch2_fs_io_read_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_READ_H */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
new file mode 100644
index 000000000000..8c8cb1541ac9
--- /dev/null
+++ b/fs/bcachefs/io_write.c
@@ -0,0 +1,1675 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_write.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/blkdev.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+ u64 now, int rw)
+{
+ u64 latency_capable =
+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+ /* ideally we'd be taking into account the device's variance here: */
+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+ s64 latency_over = io_latency - latency_threshold;
+
+ if (latency_threshold && latency_over > 0) {
+ /*
+ * bump up congested by approximately latency_over * 4 /
+ * latency_threshold - we don't need much accuracy here so don't
+ * bother with the divide:
+ */
+ if (atomic_read(&ca->congested) < CONGESTED_MAX)
+ atomic_add(latency_over >>
+ max_t(int, ilog2(latency_threshold) - 2, 0),
+ &ca->congested);
+
+ ca->congested_last = now;
+ } else if (atomic_read(&ca->congested) > 0) {
+ atomic_dec(&ca->congested);
+ }
+}
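+
+/*
+ * Worked example of the shift above: it approximates
+ * latency_over * 4 / latency_threshold without a divide. With
+ * latency_threshold = 1024 (ilog2() = 10, so shift = 8), an overage of
+ * 2560 bumps congested by 2560 >> 8 = 10, exactly 2560 * 4 / 1024; for
+ * non-power-of-two thresholds the result is within a factor of two of
+ * the exact value, which is all the accuracy this heuristic needs.
+ * Minimal sketch (helper name is illustrative):
+ */
+static inline u64 congested_bump_amount(u64 latency_over, u64 latency_threshold)
+{
+	return latency_over >> max_t(int, ilog2(latency_threshold) - 2, 0);
+}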
+
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
+{
+ atomic64_t *latency = &ca->cur_latency[rw];
+ u64 now = local_clock();
+ u64 io_latency = time_after64(now, submit_time)
+ ? now - submit_time
+ : 0;
+ u64 old, new, v = atomic64_read(latency);
+
+ do {
+ old = v;
+
+ /*
+ * If the io latency was reasonably close to the current
+ * latency, skip doing the update and atomic operation - most of
+ * the time:
+ */
+ if (abs((int) (old - io_latency)) < (old >> 1) &&
+ now & ~(~0U << 5))
+ break;
+
+ new = ewma_add(old, io_latency, 5);
+ } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+
+ bch2_congested_acct(ca, io_latency, now, rw);
+
+ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+}
+
+#endif
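+
+/*
+ * The latency estimate above is a shift-weighted moving average: with
+ * weight 5, each new sample contributes roughly 1/32 of the estimate.
+ * Assuming ewma_add() follows the usual shift-based form, the update is
+ * equivalent to this self-contained sketch (helper name is illustrative):
+ */
+static inline u64 ewma_shift_update(u64 old, u64 sample, unsigned weight)
+{
+	/* new is approximately old + (sample - old) / 2^weight */
+	return ((old << weight) - old + sample) >> weight;
+}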
+
+/* Allocate, free from mempool: */
+
+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
+{
+ struct bvec_iter_all iter;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, iter)
+ if (bv->bv_page != ZERO_PAGE(0))
+ mempool_free(bv->bv_page, &c->bio_bounce_pages);
+ bio->bi_vcnt = 0;
+}
+
+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
+{
+ struct page *page;
+
+ if (likely(!*using_mempool)) {
+ page = alloc_page(GFP_NOFS);
+ if (unlikely(!page)) {
+ mutex_lock(&c->bio_bounce_pages_lock);
+ *using_mempool = true;
+ goto pool_alloc;
+
+ }
+ } else {
+pool_alloc:
+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
+ }
+
+ return page;
+}
+
+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
+ size_t size)
+{
+ bool using_mempool = false;
+
+ while (size) {
+ struct page *page = __bio_alloc_page_pool(c, &using_mempool);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
+
+ BUG_ON(!bio_add_page(bio, page, len, 0));
+ size -= len;
+ }
+
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+/* Extent update path: */
+
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *new,
+ bool *usage_increasing,
+ s64 *i_sectors_delta,
+ s64 *disk_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c old;
+ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
+ int ret = 0;
+
+ *usage_increasing = false;
+ *i_sectors_delta = 0;
+ *disk_sectors_delta = 0;
+
+ bch2_trans_copy_iter(&iter, extent_iter);
+
+ for_each_btree_key_upto_continue_norestart(iter,
+ new->k.p, BTREE_ITER_SLOTS, old, ret) {
+ s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+ max(bkey_start_offset(&new->k),
+ bkey_start_offset(old.k));
+
+ *i_sectors_delta += sectors *
+ (bkey_extent_is_allocation(&new->k) -
+ bkey_extent_is_allocation(old.k));
+
+ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
+ *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
+ ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
+ : 0;
+
+ if (!*usage_increasing &&
+ (new->k.p.snapshot != old.k->p.snapshot ||
+ new_replicas > bch2_bkey_replicas(c, old) ||
+ (!new_compressed && bch2_bkey_sectors_compressed(old))))
+ *usage_increasing = true;
+
+ if (bkey_ge(old.k->p, new->k.p))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
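+
+/*
+ * Worked example of the per-overlap accounting above, with illustrative
+ * numbers: a new 2-replica extent covering sectors [0, 8) overwrites an
+ * old 1-replica allocated extent covering [4, 12) in the same snapshot.
+ * The overlap is min(8, 12) - max(0, 4) = 4 sectors, so i_sectors_delta
+ * changes by 4 * (1 - 1) = 0 (the data was already allocated) and
+ * disk_sectors_delta changes by 4 * 2 - 4 * 1 = 4 (one extra replica on
+ * disk). Minimal sketch of the disk-sectors step for a single overlap,
+ * treating replica counts loosely (helper name is hypothetical):
+ */
+static inline s64 overlap_disk_sectors_delta(u64 new_start, u64 new_end,
+					      u64 old_start, u64 old_end,
+					      unsigned new_replicas,
+					      unsigned old_replicas,
+					      bool same_snapshot)
+{
+	s64 sectors = min(new_end, old_end) - max(new_start, old_start);
+
+	return sectors * new_replicas -
+		(same_snapshot ? sectors * old_replicas : 0);
+}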
+
+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ u64 new_i_size,
+ s64 i_sectors_delta)
+{
+ struct btree_iter iter;
+ struct bkey_i *k;
+ struct bkey_i_inode_v3 *inode;
+ /*
+ * Crazy performance optimization:
+ * Every extent update needs to also update the inode: the inode trigger
+ * will set bi->journal_seq to the journal sequence number of this
+ * transaction - for fsync.
+ *
+ * But if that's the only reason we're updating the inode (we're not
+ * updating bi_size or bi_sectors), then we don't need the inode update
+ * to be journalled - if we crash, the bi_journal_seq update will be
+ * lost, but that's fine.
+ */
+ unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
+ int ret;
+
+ k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+ SPOS(0,
+ extent_iter->pos.inode,
+ extent_iter->snapshot),
+ BTREE_ITER_CACHED);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ return ret;
+
+ if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
+ k = bch2_inode_to_v3(trans, k);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ inode = bkey_i_to_inode_v3(k);
+
+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
+ new_i_size > le64_to_cpu(inode->v.bi_size)) {
+ inode->v.bi_size = cpu_to_le64(new_i_size);
+ inode_update_flags = 0;
+ }
+
+ if (i_sectors_delta) {
+ le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+ inode_update_flags = 0;
+ }
+
+ if (inode->k.p.snapshot != iter.snapshot) {
+ inode->k.p.snapshot = iter.snapshot;
+ inode_update_flags = 0;
+ }
+
+ ret = bch2_trans_update(trans, &iter, &inode->k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ inode_update_flags);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_extent_update(struct btree_trans *trans,
+ subvol_inum inum,
+ struct btree_iter *iter,
+ struct bkey_i *k,
+ struct disk_reservation *disk_res,
+ u64 new_i_size,
+ s64 *i_sectors_delta_total,
+ bool check_enospc)
+{
+ struct bpos next_pos;
+ bool usage_increasing;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ int ret;
+
+ /*
+ * This traverses the iterator without changing iter->path->pos to
+ * search_key() (which is pos + 1 for extents): we want there to be a
+ * path already traversed at iter->pos because
+ * bch2_trans_extent_update() will use it to attempt extent merging
+ */
+ ret = __bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ ret = bch2_extent_trim_atomic(trans, iter, k);
+ if (ret)
+ return ret;
+
+ next_pos = k->k.p;
+
+ ret = bch2_sum_sector_overwrites(trans, iter, k,
+ &usage_increasing,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ return ret;
+
+ if (disk_res &&
+ disk_sectors_delta > (s64) disk_res->sectors) {
+ ret = bch2_disk_reservation_add(trans->c, disk_res,
+ disk_sectors_delta - disk_res->sectors,
+ !check_enospc || !usage_increasing
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Note:
+ * We always have to do an inode update - even when i_size/i_sectors
+ * aren't changing - for fsync to work properly; fsync relies on
+ * inode->bi_journal_seq which is updated by the trigger code:
+ */
+ ret = bch2_extent_update_i_size_sectors(trans, iter,
+ min(k->k.p.offset << 9, new_i_size),
+ i_sectors_delta) ?:
+ bch2_trans_update(trans, iter, k, 0) ?:
+ bch2_trans_commit(trans, disk_res, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL);
+ if (unlikely(ret))
+ return ret;
+
+ if (i_sectors_delta_total)
+ *i_sectors_delta_total += i_sectors_delta;
+ bch2_btree_iter_set_pos(iter, next_pos);
+ return 0;
+}
+
+static int bch2_write_index_default(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct bkey_buf sk;
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_i *k = bch2_keylist_front(keys);
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ subvol_inum inum = {
+ .subvol = op->subvol,
+ .inum = k->k.p.inode,
+ };
+ int ret;
+
+ BUG_ON(!inum.subvol);
+
+ bch2_bkey_buf_init(&sk);
+
+ do {
+ bch2_trans_begin(trans);
+
+ k = bch2_keylist_front(keys);
+ bch2_bkey_buf_copy(&sk, c, k);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
+ &sk.k->k.p.snapshot);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ bkey_start_pos(&sk.k->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_extent_update(trans, inum, &iter, sk.k,
+ &op->res,
+ op->new_i_size, &op->i_sectors_delta,
+ op->flags & BCH_WRITE_CHECK_ENOSPC);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ if (bkey_ge(iter.pos, k->k.p))
+ bch2_keylist_pop_front(&op->insert_keys);
+ else
+ bch2_cut_front(iter.pos, k);
+ } while (!bch2_keylist_empty(keys));
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+
+ return ret;
+}
+
+/* Writes */
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+ enum bch_data_type type,
+ const struct bkey_i *k,
+ bool nocow)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+ const struct bch_extent_ptr *ptr;
+ struct bch_write_bio *n;
+ struct bch_dev *ca;
+
+ BUG_ON(c->opts.nochanges);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+ !c->devs[ptr->dev]);
+
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (to_entry(ptr + 1) < ptrs.end) {
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
+ GFP_NOFS, &ca->replica_set));
+
+ n->bio.bi_end_io = wbio->bio.bi_end_io;
+ n->bio.bi_private = wbio->bio.bi_private;
+ n->parent = wbio;
+ n->split = true;
+ n->bounce = false;
+ n->put_bio = true;
+ n->bio.bi_opf = wbio->bio.bi_opf;
+ bio_inc_remaining(&wbio->bio);
+ } else {
+ n = wbio;
+ n->split = false;
+ }
+
+ n->c = c;
+ n->dev = ptr->dev;
+ n->have_ioref = nocow || bch2_dev_get_ioref(ca,
+ type == BCH_DATA_btree ? READ : WRITE);
+ n->nocow = nocow;
+ n->submit_time = local_clock();
+ n->inode_offset = bkey_start_offset(&k->k);
+ n->bio.bi_iter.bi_sector = ptr->offset;
+
+ if (likely(n->have_ioref)) {
+ this_cpu_add(ca->io_done->sectors[WRITE][type],
+ bio_sectors(&n->bio));
+
+ bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+ if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
+ bio_endio(&n->bio);
+ continue;
+ }
+
+ submit_bio(&n->bio);
+ } else {
+ n->bio.bi_status = BLK_STS_REMOVED;
+ bio_endio(&n->bio);
+ }
+ }
+}
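+
+/*
+ * The replica fan-out above clones the bio once per pointer except the
+ * last, which reuses the original; bio_inc_remaining() keeps the parent's
+ * completion pending until every clone's endio has run. The same shape in
+ * miniature - submit_one() is a stand-in for the per-device submission:
+ */
+static inline void replica_fan_out_sketch(struct bio *orig, unsigned nr_targets,
+					  struct bio_set *bs,
+					  void (*submit_one)(struct bio *, unsigned))
+{
+	unsigned i;
+
+	for (i = 0; i < nr_targets; i++) {
+		struct bio *b = orig;
+
+		if (i + 1 < nr_targets) {
+			b = bio_alloc_clone(NULL, orig, GFP_NOFS, bs);
+			b->bi_end_io	= orig->bi_end_io;
+			b->bi_private	= orig->bi_private;
+			bio_inc_remaining(orig);
+		}
+
+		submit_one(b, i);
+	}
+}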
+
+static void __bch2_write(struct bch_write_op *);
+
+static void bch2_write_done(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_fs *c = op->c;
+
+ EBUG_ON(op->open_buckets.nr);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+ bch2_disk_reservation_put(c, &op->res);
+
+ if (!(op->flags & BCH_WRITE_MOVE))
+ bch2_write_ref_put(c, BCH_WRITE_REF_write);
+ bch2_keylist_free(&op->insert_keys, op->inline_keys);
+
+ EBUG_ON(cl->parent);
+ closure_debug_destroy(cl);
+ if (op->end_io)
+ op->end_io(op);
+}
+
+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
+{
+ struct keylist *keys = &op->insert_keys;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *src, *dst = keys->keys, *n;
+
+ for (src = keys->keys; src != keys->top; src = n) {
+ n = bkey_next(src);
+
+ if (bkey_extent_is_direct_data(&src->k)) {
+ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+ test_bit(ptr->dev, op->failed.d));
+
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
+ return -EIO;
+ }
+
+ if (dst != src)
+ memmove_u64s_down(dst, src, src->k.u64s);
+ dst = bkey_next(dst);
+ }
+
+ keys->top = dst;
+ return 0;
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ * @op: bch_write_op to process
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct keylist *keys = &op->insert_keys;
+ unsigned dev;
+ int ret = 0;
+
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ ret = bch2_write_drop_io_error_ptrs(op);
+ if (ret)
+ goto err;
+ }
+
+ if (!bch2_keylist_empty(keys)) {
+ u64 sectors_start = keylist_sectors(keys);
+
+ ret = !(op->flags & BCH_WRITE_MOVE)
+ ? bch2_write_index_default(op)
+ : bch2_data_update_index_update(op);
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ BUG_ON(keylist_sectors(keys) && !ret);
+
+ op->written += sectors_start - keylist_sectors(keys);
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+ bch_err_inum_offset_ratelimited(c,
+ insert->k.p.inode, insert->k.p.offset << 9,
+ "write error while doing btree update: %s",
+ bch2_err_str(ret));
+ }
+
+ if (ret)
+ goto err;
+ }
+out:
+ /* If a bucket wasn't written, we can't erasure code it: */
+ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+
+ bch2_open_buckets_put(c, &op->open_buckets);
+ return;
+err:
+ keys->top = keys->keys;
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ goto out;
+}
+
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+ if (state != wp->state) {
+ u64 now = ktime_get_ns();
+
+ if (wp->last_state_change &&
+ time_after64(now, wp->last_state_change))
+ wp->time[wp->state] += now - wp->last_state_change;
+ wp->state = state;
+ wp->last_state_change = now;
+ }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+ enum write_point_state state;
+
+ state = running ? WRITE_POINT_running :
+ !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+ : WRITE_POINT_stopped;
+
+ __wp_update_state(wp, state);
+}
+
+static CLOSURE_CALLBACK(bch2_write_index)
+{
+ closure_type(op, struct bch_write_op, cl);
+ struct write_point *wp = op->wp;
+ struct workqueue_struct *wq = index_update_wq(op);
+ unsigned long flags;
+
+ if ((op->flags & BCH_WRITE_DONE) &&
+ (op->flags & BCH_WRITE_MOVE))
+ bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
+
+ spin_lock_irqsave(&wp->writes_lock, flags);
+ if (wp->state == WRITE_POINT_waiting_io)
+ __wp_update_state(wp, WRITE_POINT_waiting_work);
+ list_add_tail(&op->wp_list, &wp->writes);
+ spin_unlock_irqrestore(&wp->writes_lock, flags);
+
+ queue_work(wq, &wp->index_update_work);
+}
+
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+ op->wp = wp;
+
+ if (wp->state == WRITE_POINT_stopped) {
+ spin_lock_irq(&wp->writes_lock);
+ __wp_update_state(wp, WRITE_POINT_waiting_io);
+ spin_unlock_irq(&wp->writes_lock);
+ }
+}
+
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+ struct write_point *wp =
+ container_of(work, struct write_point, index_update_work);
+ struct bch_write_op *op;
+
+ while (1) {
+ spin_lock_irq(&wp->writes_lock);
+ op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+ if (op)
+ list_del(&op->wp_list);
+ wp_update_state(wp, op != NULL);
+ spin_unlock_irq(&wp->writes_lock);
+
+ if (!op)
+ break;
+
+ op->flags |= BCH_WRITE_IN_WORKER;
+
+ __bch2_write_index(op);
+
+ if (!(op->flags & BCH_WRITE_DONE))
+ __bch2_write(op);
+ else
+ bch2_write_done(&op->cl);
+ }
+}
+
+static void bch2_write_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_fs *c = wbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status))) {
+ set_bit(wbio->dev, op->failed.d);
+ op->flags |= BCH_WRITE_IO_ERROR;
+ }
+
+ if (wbio->nocow)
+ set_bit(wbio->dev, op->devs_need_flush->d);
+
+ if (wbio->have_ioref) {
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ percpu_ref_put(&ca->io_ref);
+ }
+
+ if (wbio->bounce)
+ bch2_bio_free_pages_pool(c, bio);
+
+ if (wbio->put_bio)
+ bio_put(bio);
+
+ if (parent)
+ bio_endio(&parent->bio);
+ else
+ closure_put(cl);
+}
+
+static void init_append_extent(struct bch_write_op *op,
+ struct write_point *wp,
+ struct bversion version,
+ struct bch_extent_crc_unpacked crc)
+{
+ struct bkey_i_extent *e;
+
+ op->pos.offset += crc.uncompressed_size;
+
+ e = bkey_extent_init(op->insert_keys.top);
+ e->k.p = op->pos;
+ e->k.size = crc.uncompressed_size;
+ e->k.version = version;
+
+ if (crc.csum_type ||
+ crc.compression_type ||
+ crc.nonce)
+ bch2_extent_crc_append(&e->k_i, crc);
+
+ bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
+ op->flags & BCH_WRITE_CACHED);
+
+ bch2_keylist_push(&op->insert_keys);
+}
+
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+ struct write_point *wp,
+ struct bio *src,
+ bool *page_alloc_failed,
+ void *buf)
+{
+ struct bch_write_bio *wbio;
+ struct bio *bio;
+ unsigned output_available =
+ min(wp->sectors_free << 9, src->bi_iter.bi_size);
+ unsigned pages = DIV_ROUND_UP(output_available +
+ (buf
+ ? ((unsigned long) buf & (PAGE_SIZE - 1))
+ : 0), PAGE_SIZE);
+
+ pages = min(pages, BIO_MAX_VECS);
+
+ bio = bio_alloc_bioset(NULL, pages, 0,
+ GFP_NOFS, &c->bio_write);
+ wbio = wbio_init(bio);
+ wbio->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ wbio->bio.bi_opf = src->bi_opf;
+
+ if (buf) {
+ bch2_bio_map(bio, buf, output_available);
+ return bio;
+ }
+
+ wbio->bounce = true;
+
+ /*
+ * We can't use mempool for more than c->sb.encoded_extent_max
+ * worth of pages, but we'd like to allocate more if we can:
+ */
+ bch2_bio_alloc_pages_pool(c, bio,
+ min_t(unsigned, output_available,
+ c->opts.encoded_extent_max));
+
+ if (bio->bi_iter.bi_size < output_available)
+ *page_alloc_failed =
+ bch2_bio_alloc_pages(bio,
+ output_available -
+ bio->bi_iter.bi_size,
+ GFP_NOFS) != 0;
+
+ return bio;
+}
+
+static int bch2_write_rechecksum(struct bch_fs *c,
+ struct bch_write_op *op,
+ unsigned new_csum_type)
+{
+ struct bio *bio = &op->wbio.bio;
+ struct bch_extent_crc_unpacked new_crc;
+ int ret;
+
+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(new_csum_type))
+ new_csum_type = op->crc.csum_type;
+
+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
+ if (ret)
+ return ret;
+
+ bio_advance(bio, op->crc.offset << 9);
+ bio->bi_iter.bi_size = op->crc.live_size << 9;
+ op->crc = new_crc;
+ return 0;
+}
+
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ struct bch_csum csum;
+ int ret;
+
+ if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+ return 0;
+
+ /*
+ * If we need to decrypt data in the write path, we'll no longer be able
+ * to verify the existing checksum (poly1305 mac, in this case) after
+ * it's decrypted - this is the last point we'll be able to reverify the
+ * checksum:
+ */
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ return -EIO;
+
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ return ret;
+}
+
+static enum prep_encoded_ret {
+ PREP_ENCODED_OK,
+ PREP_ENCODED_ERR,
+ PREP_ENCODED_CHECKSUM_ERR,
+ PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+ struct bch_fs *c = op->c;
+ struct bio *bio = &op->wbio.bio;
+
+ if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+ return PREP_ENCODED_OK;
+
+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
+
+ /* Can we just write the entire extent as is? */
+ if (op->crc.uncompressed_size == op->crc.live_size &&
+ op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
+ op->crc.compressed_size <= wp->sectors_free &&
+ (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
+ op->incompressible)) {
+ if (!crc_is_compressed(op->crc) &&
+ op->csum_type != op->crc.csum_type &&
+ bch2_write_rechecksum(c, op, op->csum_type) &&
+ !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ return PREP_ENCODED_DO_WRITE;
+ }
+
+ /*
+ * If the data is compressed and we couldn't write the entire extent as
+ * is, we have to decompress it:
+ */
+ if (crc_is_compressed(op->crc)) {
+ struct bch_csum csum;
+
+ if (bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ /* Last point we can still verify checksum: */
+ csum = bch2_checksum_bio(c, op->crc.csum_type,
+ extent_nonce(op->version, op->crc),
+ bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+ return PREP_ENCODED_ERR;
+ }
+
+ /*
+ * No longer have compressed data after this point - data might be
+ * encrypted:
+ */
+
+ /*
+ * If the data is checksummed and we're only writing a subset,
+ * rechecksum and adjust bio to point to currently live data:
+ */
+ if ((op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) &&
+ bch2_write_rechecksum(c, op, op->csum_type) &&
+ !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ /*
+ * If we want to compress the data, it has to be decrypted:
+ */
+ if ((op->compression_opt ||
+ bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(op->csum_type)) &&
+ bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ return PREP_ENCODED_OK;
+}
+
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
+ struct bio **_dst)
+{
+ struct bch_fs *c = op->c;
+ struct bio *src = &op->wbio.bio, *dst = src;
+ struct bvec_iter saved_iter;
+ void *ec_buf;
+ unsigned total_output = 0, total_input = 0;
+ bool bounce = false;
+ bool page_alloc_failed = false;
+ int ret, more = 0;
+
+ BUG_ON(!bio_sectors(src));
+
+ ec_buf = bch2_writepoint_ec_buf(c, wp);
+
+ switch (bch2_write_prep_encoded_data(op, wp)) {
+ case PREP_ENCODED_OK:
+ break;
+ case PREP_ENCODED_ERR:
+ ret = -EIO;
+ goto err;
+ case PREP_ENCODED_CHECKSUM_ERR:
+ goto csum_err;
+ case PREP_ENCODED_DO_WRITE:
+ /* XXX look for bug here */
+ if (ec_buf) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bio_copy_data(dst, src);
+ bounce = true;
+ }
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
+ }
+
+ if (ec_buf ||
+ op->compression_opt ||
+ (op->csum_type &&
+ !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ (bch2_csum_type_is_encryption(op->csum_type) &&
+ !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bounce = true;
+ }
+
+ saved_iter = dst->bi_iter;
+
+ do {
+ struct bch_extent_crc_unpacked crc = { 0 };
+ struct bversion version = op->version;
+ size_t dst_len = 0, src_len = 0;
+
+ if (page_alloc_failed &&
+ dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
+ dst->bi_iter.bi_size < c->opts.encoded_extent_max)
+ break;
+
+ BUG_ON(op->compression_opt &&
+ (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_csum_type_is_encryption(op->crc.csum_type));
+ BUG_ON(op->compression_opt && !bounce);
+
+ crc.compression_type = op->incompressible
+ ? BCH_COMPRESSION_TYPE_incompressible
+ : op->compression_opt
+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+ op->compression_opt)
+ : 0;
+ if (!crc_is_compressed(crc)) {
+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+ if (op->csum_type)
+ dst_len = min_t(unsigned, dst_len,
+ c->opts.encoded_extent_max);
+
+ if (bounce) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bio_copy_data(dst, src);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+
+ src_len = dst_len;
+ }
+
+ BUG_ON(!src_len || !dst_len);
+
+ if (bch2_csum_type_is_encryption(op->csum_type)) {
+ if (bversion_zero(version)) {
+ version.lo = atomic64_inc_return(&c->key_version);
+ } else {
+ crc.nonce = op->nonce;
+ op->nonce += src_len >> 9;
+ }
+ }
+
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ !crc_is_compressed(crc) &&
+ bch2_csum_type_is_encryption(op->crc.csum_type) ==
+ bch2_csum_type_is_encryption(op->csum_type)) {
+ u8 compression_type = crc.compression_type;
+ u16 nonce = crc.nonce;
+ /*
+ * Note: when we're using rechecksum(), we need to be
+ * checksumming @src because it has all the data our
+ * existing checksum covers - if we bounced (because we
+ * were trying to compress), @dst will only have the
+ * part of the data the new checksum will cover.
+ *
+ * But normally we want to be checksumming post bounce,
+ * because part of the reason for bouncing is so the
+ * data can't be modified (by userspace) while it's in
+ * flight.
+ */
+ if (bch2_rechecksum_bio(c, src, version, op->crc,
+ &crc, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->csum_type))
+ goto csum_err;
+ /*
+ * bch2_rechecksum_bio() sets compression_type on crc from op->crc;
+ * this isn't always correct, as sometimes we're changing an extent
+ * from uncompressed to incompressible.
+ */
+ crc.compression_type = compression_type;
+ crc.nonce = nonce;
+ } else {
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_rechecksum_bio(c, src, version, op->crc,
+ NULL, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->crc.csum_type))
+ goto csum_err;
+
+ crc.compressed_size = dst_len >> 9;
+ crc.uncompressed_size = src_len >> 9;
+ crc.live_size = src_len >> 9;
+
+ swap(dst->bi_iter.bi_size, dst_len);
+ ret = bch2_encrypt_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ if (ret)
+ goto err;
+
+ crc.csum = bch2_checksum_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ crc.csum_type = op->csum_type;
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+
+ init_append_extent(op, wp, version, crc);
+
+ if (dst != src)
+ bio_advance(dst, dst_len);
+ bio_advance(src, src_len);
+ total_output += dst_len;
+ total_input += src_len;
+ } while (dst->bi_iter.bi_size &&
+ src->bi_iter.bi_size &&
+ wp->sectors_free &&
+ !bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX));
+
+ more = src->bi_iter.bi_size != 0;
+
+ dst->bi_iter = saved_iter;
+
+ if (dst == src && more) {
+ BUG_ON(total_output != total_input);
+
+ dst = bio_split(src, total_input >> 9,
+ GFP_NOFS, &c->bio_write);
+ wbio_init(dst)->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ dst->bi_opf = src->bi_opf;
+ }
+
+ dst->bi_iter.bi_size = total_output;
+do_write:
+ *_dst = dst;
+ return more;
+csum_err:
+ bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ ret = -EIO;
+err:
+ if (to_wbio(dst)->bounce)
+ bch2_bio_free_pages_pool(c, dst);
+ if (to_wbio(dst)->put_bio)
+ bio_put(dst);
+
+ return ret;
+}
+
+static bool bch2_extent_is_writeable(struct bch_write_op *op,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = op->c;
+ struct bkey_s_c_extent e;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ unsigned replicas = 0;
+
+ if (k.k->type != KEY_TYPE_extent)
+ return false;
+
+ e = bkey_s_c_to_extent(k);
+ extent_for_each_ptr_decode(e, p, entry) {
+ if (crc_is_encoded(p.crc) || p.has_ec)
+ return false;
+
+ replicas += bch2_extent_ptr_durability(c, &p);
+ }
+
+ return replicas >= op->opts.data_replicas;
+}
+
+static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ const struct bch_extent_ptr *ptr;
+ struct bkey_i *k;
+
+ for_each_keylist_key(&op->insert_keys, k) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+
+ bkey_for_each_ptr(ptrs, ptr)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, ptr),
+ BUCKET_NOCOW_LOCK_UPDATE);
+ }
+}
+
+static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *orig,
+ struct bkey_s_c k,
+ u64 new_i_size)
+{
+ struct bkey_i *new;
+ struct bkey_ptrs ptrs;
+ struct bch_extent_ptr *ptr;
+ int ret;
+
+ if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
+ /* trace this */
+ return 0;
+ }
+
+ new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bch2_cut_front(bkey_start_pos(&orig->k), new);
+ bch2_cut_back(orig->k.p, new);
+
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr(ptrs, ptr)
+ ptr->unwritten = 0;
+
+ /*
+ * Note that we're not calling bch2_subvol_get_snapshot() in this path -
+ * that was done when we kicked off the write, and here it's important
+ * that we update the extent that we wrote to - even if a snapshot has
+ * since been created. The write is still outstanding, so we're ok
+ * w.r.t. snapshot atomicity:
+ */
+ return bch2_extent_update_i_size_sectors(trans, iter,
+ min(new->k.p.offset << 9, new_i_size), 0) ?:
+ bch2_trans_update(trans, iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_i *orig;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_keylist_key(&op->insert_keys, orig) {
+ ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ bkey_start_pos(&orig->k), orig->k.p,
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL, ({
+ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
+ }));
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+ bch_err_inum_offset_ratelimited(c,
+ insert->k.p.inode, insert->k.p.offset << 9,
+ "write error while doing btree update: %s",
+ bch2_err_str(ret));
+ }
+
+ if (ret) {
+ op->error = ret;
+ break;
+ }
+ }
+
+ bch2_trans_put(trans);
+}
+
+static void __bch2_nocow_write_done(struct bch_write_op *op)
+{
+ bch2_nocow_write_unlock(op);
+
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ op->error = -EIO;
+ } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+ bch2_nocow_write_convert_unwritten(op);
+}
+
+static CLOSURE_CALLBACK(bch2_nocow_write_done)
+{
+ closure_type(op, struct bch_write_op, cl);
+
+ __bch2_nocow_write_done(op);
+ bch2_write_done(cl);
+}
+
+struct bucket_to_lock {
+ struct bpos b;
+ unsigned gen;
+ struct nocow_lock_bucket *l;
+};
+
+static void bch2_nocow_write(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_ptrs_c ptrs;
+ const struct bch_extent_ptr *ptr;
+ DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+ struct bucket_to_lock *i;
+ u32 snapshot;
+ struct bucket_to_lock *stale_at;
+ int ret;
+
+ if (op->flags & BCH_WRITE_MOVE)
+ return;
+
+ darray_init(&buckets);
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
+ if (unlikely(ret))
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(op->pos.inode, op->pos.offset, snapshot),
+ BTREE_ITER_SLOTS);
+ while (1) {
+ struct bio *bio = &op->wbio.bio;
+
+ buckets.nr = 0;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ /* fall back to normal cow write path? */
+ if (unlikely(k.k->p.snapshot != snapshot ||
+ !bch2_extent_is_writeable(op, k)))
+ break;
+
+ if (bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ k.k->u64s))
+ break;
+
+ /* Get iorefs before dropping btree locks: */
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bpos b = PTR_BUCKET_POS(c, ptr);
+ struct nocow_lock_bucket *l =
+ bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
+ prefetch(l);
+
+ if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
+ goto err_get_ioref;
+
+ /* XXX allocating memory with btree locks held - rare */
+ darray_push_gfp(&buckets, ((struct bucket_to_lock) {
+ .b = b, .gen = ptr->gen, .l = l,
+ }), GFP_KERNEL|__GFP_NOFAIL);
+
+ if (ptr->unwritten)
+ op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ }
+
+ /* Unlock before taking nocow locks, doing IO: */
+ bkey_reassemble(op->insert_keys.top, k);
+ bch2_trans_unlock(trans);
+
+ bch2_cut_front(op->pos, op->insert_keys.top);
+ if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
+ darray_for_each(buckets, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
+
+ __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
+ bucket_to_u64(i->b),
+ BUCKET_NOCOW_LOCK_UPDATE);
+
+ rcu_read_lock();
+ bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
+ rcu_read_unlock();
+
+ if (unlikely(stale)) {
+ stale_at = i;
+ goto err_bucket_stale;
+ }
+ }
+
+ bio = &op->wbio.bio;
+ if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
+ bio = bio_split(bio, k.k->p.offset - op->pos.offset,
+ GFP_KERNEL, &c->bio_write);
+ wbio_init(bio)->put_bio = true;
+ bio->bi_opf = op->wbio.bio.bi_opf;
+ } else {
+ op->flags |= BCH_WRITE_DONE;
+ }
+
+ op->pos.offset += bio_sectors(bio);
+ op->written += bio_sectors(bio);
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_opf |= REQ_OP_WRITE;
+ closure_get(&op->cl);
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+ op->insert_keys.top, true);
+
+ bch2_keylist_push(&op->insert_keys);
+ if (op->flags & BCH_WRITE_DONE)
+ break;
+ bch2_btree_iter_advance(&iter);
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (ret) {
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode, op->pos.offset << 9,
+ "%s: btree lookup error %s", __func__, bch2_err_str(ret));
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ }
+
+ bch2_trans_put(trans);
+ darray_exit(&buckets);
+
+ /* fall back to cow write path? */
+ if (!(op->flags & BCH_WRITE_DONE)) {
+ closure_sync(&op->cl);
+ __bch2_nocow_write_done(op);
+ op->insert_keys.top = op->insert_keys.keys;
+ } else if (op->flags & BCH_WRITE_SYNC) {
+ closure_sync(&op->cl);
+ bch2_nocow_write_done(&op->cl.work);
+ } else {
+ /*
+ * XXX
+ * needs to run out of process context because ei_quota_lock is
+ * a mutex
+ */
+ continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
+ }
+ return;
+err_get_ioref:
+ darray_for_each(buckets, i)
+ percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
+
+ /* Fall back to COW path: */
+ goto out;
+err_bucket_stale:
+ darray_for_each(buckets, i) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
+ if (i == stale_at)
+ break;
+ }
+
+ /* We can retry this: */
+ ret = -BCH_ERR_transaction_restart;
+ goto err_get_ioref;
+}
+
+static void __bch2_write(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct write_point *wp = NULL;
+ struct bio *bio = NULL;
+ unsigned nofs_flags;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+
+ if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
+ bch2_nocow_write(op);
+ if (op->flags & BCH_WRITE_DONE)
+ goto out_nofs_restore;
+ }
+again:
+ memset(&op->failed, 0, sizeof(op->failed));
+
+ do {
+ struct bkey_i *key_to_write;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+
+ /* +1 for possible cache device: */
+ if (op->open_buckets.nr + op->nr_replicas + 1 >
+ ARRAY_SIZE(op->open_buckets.v))
+ break;
+
+ if (bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX))
+ break;
+
+ /*
+ * The copygc thread is now global, so it's no longer freeing up
+ * space on specific disks, and allocations for specific disks may
+ * hang arbitrarily long:
+ */
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_alloc_sectors_start_trans(trans,
+ op->target,
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->write_point,
+ &op->devs_have,
+ op->nr_replicas,
+ op->nr_replicas_required,
+ op->watermark,
+ op->flags,
+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS))
+ ? NULL : &op->cl, &wp));
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ break;
+
+ goto err;
+ }
+
+ EBUG_ON(!wp);
+
+ bch2_open_bucket_get(c, wp, &op->open_buckets);
+ ret = bch2_write_extent(op, wp, &bio);
+
+ bch2_alloc_sectors_done_inlined(c, wp);
+err:
+ if (ret <= 0) {
+ op->flags |= BCH_WRITE_DONE;
+
+ if (ret < 0) {
+ op->error = ret;
+ break;
+ }
+ }
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_opf |= REQ_OP_WRITE;
+
+ closure_get(bio->bi_private);
+
+ key_to_write = (void *) (op->insert_keys.keys_p +
+ key_to_write_offset);
+
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+ key_to_write, false);
+ } while (ret);
+
+ /*
+ * Sync or no?
+ *
+ * If we're running asynchronously, we may still want to block
+ * synchronously here if we weren't able to submit all of the IO at
+ * once, as that signals backpressure to the caller.
+ */
+ if ((op->flags & BCH_WRITE_SYNC) ||
+ (!(op->flags & BCH_WRITE_DONE) &&
+ !(op->flags & BCH_WRITE_IN_WORKER))) {
+ closure_sync(&op->cl);
+ __bch2_write_index(op);
+
+ if (!(op->flags & BCH_WRITE_DONE))
+ goto again;
+ bch2_write_done(&op->cl);
+ } else {
+ bch2_write_queue(op, wp);
+ continue_at(&op->cl, bch2_write_index, NULL);
+ }
+out_nofs_restore:
+ memalloc_nofs_restore(nofs_flags);
+}
+
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+ struct bio *bio = &op->wbio.bio;
+ struct bvec_iter iter;
+ struct bkey_i_inline_data *id;
+ unsigned sectors;
+ int ret;
+
+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+ op->flags |= BCH_WRITE_DONE;
+
+ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
+
+ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+ if (ret) {
+ op->error = ret;
+ goto err;
+ }
+
+ sectors = bio_sectors(bio);
+ op->pos.offset += sectors;
+
+ id = bkey_inline_data_init(op->insert_keys.top);
+ id->k.p = op->pos;
+ id->k.version = op->version;
+ id->k.size = sectors;
+
+ iter = bio->bi_iter;
+ iter.bi_size = data_len;
+ memcpy_from_bio(id->v.data, bio, iter);
+
+ while (data_len & 7)
+ id->v.data[data_len++] = '\0';
+ set_bkey_val_bytes(&id->k, data_len);
+ bch2_keylist_push(&op->insert_keys);
+
+ __bch2_write_index(op);
+err:
+ bch2_write_done(&op->cl);
+}
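+
+/*
+ * The padding loop above rounds the inline value up to a whole number of
+ * u64s, zero-filling the tail: e.g. 13 bytes of data become a 16 byte
+ * value (two u64s) with bytes 13..15 set to 0. Equivalent sketch of the
+ * resulting value size (helper name is illustrative):
+ */
+static inline unsigned inline_data_val_bytes(unsigned data_len)
+{
+	return round_up(data_len, sizeof(u64));
+}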
+
+/**
+ * bch2_write() - handle a write to a cache device or flash only volume
+ * @cl: &bch_write_op->cl
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
+ */
+CLOSURE_CALLBACK(bch2_write)
+{
+ closure_type(op, struct bch_write_op, cl);
+ struct bio *bio = &op->wbio.bio;
+ struct bch_fs *c = op->c;
+ unsigned data_len;
+
+ EBUG_ON(op->cl.parent);
+ BUG_ON(!op->nr_replicas);
+ BUG_ON(!op->write_point.v);
+ BUG_ON(bkey_eq(op->pos, POS_MAX));
+
+ op->start_time = local_clock();
+ bch2_keylist_init(&op->insert_keys, op->inline_keys);
+ wbio_init(bio)->put_bio = false;
+
+ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "misaligned write");
+ op->error = -EIO;
+ goto err;
+ }
+
+ if (c->opts.nochanges) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ if (!(op->flags & BCH_WRITE_MOVE) &&
+ !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+ bch2_increment_clock(c, bio_sectors(bio), WRITE);
+
+ data_len = min_t(u64, bio->bi_iter.bi_size,
+ op->new_i_size - (op->pos.offset << 9));
+
+ if (c->opts.inline_data &&
+ data_len <= min(block_bytes(c) / 2, 1024U)) {
+ bch2_write_data_inline(op, data_len);
+ return;
+ }
+
+ __bch2_write(op);
+ return;
+err:
+ bch2_disk_reservation_put(c, &op->res);
+
+ closure_debug_destroy(&op->cl);
+ if (op->end_io)
+ op->end_io(op);
+}
+
+static const char * const bch2_write_flags[] = {
+#define x(f) #f,
+ BCH_WRITE_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+ prt_str(out, "pos: ");
+ bch2_bpos_to_text(out, op->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "started: ");
+ bch2_pr_time_units(out, local_clock() - op->start_time);
+ prt_newline(out);
+
+ prt_str(out, "flags: ");
+ prt_bitflags(out, bch2_write_flags, op->flags);
+ prt_newline(out);
+
+ prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_fs_io_write_exit(struct bch_fs *c)
+{
+ mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->bio_write);
+}
+
+int bch2_fs_io_write_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_write_init;
+
+ if (mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->opts.btree_node_size,
+ c->opts.encoded_extent_max) /
+ PAGE_SIZE, 0))
+ return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
new file mode 100644
index 000000000000..6c276a48f95d
--- /dev/null
+++ b/fs/bcachefs/io_write.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_H
+#define _BCACHEFS_IO_WRITE_H
+
+#include "checksum.h"
+#include "io_write_types.h"
+
+#define to_wbio(_bio) \
+ container_of((_bio), struct bch_write_bio, bio)
+
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+ enum bch_data_type, const struct bkey_i *, bool);
+
+#define BCH_WRITE_FLAGS() \
+ x(ALLOC_NOWAIT) \
+ x(CACHED) \
+ x(DATA_ENCODED) \
+ x(PAGES_STABLE) \
+ x(PAGES_OWNED) \
+ x(ONLY_SPECIFIED_DEVS) \
+ x(WROTE_DATA_INLINE) \
+ x(FROM_INTERNAL) \
+ x(CHECK_ENOSPC) \
+ x(SYNC) \
+ x(MOVE) \
+ x(IN_WORKER) \
+ x(DONE) \
+ x(IO_ERROR) \
+ x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {
+#define x(f) __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {
+#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+ BCH_WRITE_FLAGS()
+#undef x
+};
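
The BCH_WRITE_FLAGS() x-macro above is expanded twice here, once into bit indices and once into the corresponding masks (and a third time in io_write.c into the flag-name strings used by prt_bitflags()). As a rough illustration of the expansion (editor's sketch, not part of the patch):

/* Illustrative expansion of the first few x() entries only. */
enum __bch_write_flags_expanded {
	__BCH_WRITE_ALLOC_NOWAIT,	/* 0 */
	__BCH_WRITE_CACHED,		/* 1 */
	__BCH_WRITE_DATA_ENCODED,	/* 2 */
	/* ...one enumerator per x() line above... */
};

enum bch_write_flags_expanded {
	BCH_WRITE_ALLOC_NOWAIT	= BIT(__BCH_WRITE_ALLOC_NOWAIT),	/* 1 << 0 */
	BCH_WRITE_CACHED	= BIT(__BCH_WRITE_CACHED),		/* 1 << 1 */
	BCH_WRITE_DATA_ENCODED	= BIT(__BCH_WRITE_DATA_ENCODED),	/* 1 << 2 */
};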
+
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+ return op->watermark == BCH_WATERMARK_copygc
+ ? op->c->copygc_wq
+ : op->c->btree_update_wq;
+}
+
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+ struct btree_iter *, struct bkey_i *,
+ struct disk_reservation *, u64, s64 *, bool);
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+ struct bch_io_opts opts)
+{
+ op->c = c;
+ op->end_io = NULL;
+ op->flags = 0;
+ op->written = 0;
+ op->error = 0;
+ op->csum_type = bch2_data_checksum_type(c, opts);
+ op->compression_opt = opts.compression;
+ op->nr_replicas = 0;
+ op->nr_replicas_required = c->opts.data_replicas_required;
+ op->watermark = BCH_WATERMARK_normal;
+ op->incompressible = 0;
+ op->open_buckets.nr = 0;
+ op->devs_have.nr = 0;
+ op->target = 0;
+ op->opts = opts;
+ op->subvol = 0;
+ op->pos = POS_MAX;
+ op->version = ZERO_VERSION;
+ op->write_point = (struct write_point_specifier) { 0 };
+ op->res = (struct disk_reservation) { 0 };
+ op->new_i_size = U64_MAX;
+ op->i_sectors_delta = 0;
+ op->devs_need_flush = NULL;
+}
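
bch2_write_op_init() only fills in defaults; callers still have to supply the fields that the assertions in bch2_write() check (nr_replicas, write_point, pos). A heavily hedged caller sketch, not part of the patch -- writepoint_hashed() and closure_call() are assumptions from elsewhere in the tree, and real callers (the fs-io and data move paths) do considerably more than this:

/* Hypothetical caller sketch -- illustration only. */
static void example_submit_write(struct bch_write_op *op, struct bch_fs *c,
				 subvol_inum inum, u64 sector_offset,
				 struct bch_io_opts io_opts)
{
	/* op->wbio.bio is assumed to already carry the pages to be written */
	bch2_write_op_init(op, c, io_opts);
	op->nr_replicas	= c->opts.data_replicas;
	op->subvol	= inum.subvol;
	op->pos		= POS(inum.inum, sector_offset);
	op->write_point	= writepoint_hashed((unsigned long) current);	/* assumed helper */
	/* a real caller would also set op->end_io for completion notification */

	/* bch2_write() is a CLOSURE_CALLBACK; it must not have a parent closure */
	closure_call(&op->cl, bch2_write, NULL, NULL);
}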
+
+CLOSURE_CALLBACK(bch2_write);
+void bch2_write_point_do_index_updates(struct work_struct *);
+
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+ struct bch_write_bio *wbio = to_wbio(bio);
+
+ memset(&wbio->wbio, 0, sizeof(wbio->wbio));
+ return wbio;
+}
+
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
+void bch2_fs_io_write_exit(struct bch_fs *);
+int bch2_fs_io_write_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
new file mode 100644
index 000000000000..c7f97c2c4805
--- /dev/null
+++ b/fs/bcachefs/io_write_types.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_TYPES_H
+#define _BCACHEFS_IO_WRITE_TYPES_H
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+#include "keylist_types.h"
+#include "opts.h"
+#include "super_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bch_write_bio {
+ struct_group(wbio,
+ struct bch_fs *c;
+ struct bch_write_bio *parent;
+
+ u64 submit_time;
+ u64 inode_offset;
+
+ struct bch_devs_list failed;
+ u8 dev;
+
+ unsigned split:1,
+ bounce:1,
+ put_bio:1,
+ have_ioref:1,
+ nocow:1,
+ used_mempool:1,
+ first_btree_write:1;
+ );
+
+ struct bio bio;
+};
+
+struct bch_write_op {
+ struct closure cl;
+ struct bch_fs *c;
+ void (*end_io)(struct bch_write_op *);
+ u64 start_time;
+
+ unsigned written; /* sectors */
+ u16 flags;
+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
+
+ unsigned compression_opt:8;
+ unsigned csum_type:4;
+ unsigned nr_replicas:4;
+ unsigned nr_replicas_required:4;
+ unsigned watermark:3;
+ unsigned incompressible:1;
+ unsigned stripe_waited:1;
+
+ struct bch_devs_list devs_have;
+ u16 target;
+ u16 nonce;
+ struct bch_io_opts opts;
+
+ u32 subvol;
+ struct bpos pos;
+ struct bversion version;
+
+ /* For BCH_WRITE_DATA_ENCODED: */
+ struct bch_extent_crc_unpacked crc;
+
+ struct write_point_specifier write_point;
+
+ struct write_point *wp;
+ struct list_head wp_list;
+
+ struct disk_reservation res;
+
+ struct open_buckets open_buckets;
+
+ u64 new_i_size;
+ s64 i_sectors_delta;
+
+ struct bch_devs_mask failed;
+
+ struct keylist insert_keys;
+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+
+ /*
+ * Bitmask of devices that have had nocow writes issued to them since
+ * last flush:
+ */
+ struct bch_devs_mask *devs_need_flush;
+
+ /* Must be last: */
+ struct bch_write_bio wbio;
+};
+
+#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
new file mode 100644
index 000000000000..8cf238be6213
--- /dev/null
+++ b/fs/bcachefs/journal.c
@@ -0,0 +1,1439 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_sb.h"
+#include "journal_seq_blacklist.h"
+#include "trace.h"
+
+static const char * const bch2_journal_errors[] = {
+#define x(n) #n,
+ JOURNAL_ERRORS()
+#undef x
+ NULL
+};
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+ return __journal_entry_is_open(j->reservations);
+}
+
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+ struct journal_buf *buf = NULL;
+
+ EBUG_ON(seq > journal_cur_seq(j));
+
+ if (journal_seq_unwritten(j, seq)) {
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
+ }
+ return buf;
+}
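
journal_seq_to_buf() maps a sequence number onto the small ring of in-memory journal buffers by masking its low bits; only sequence numbers that are still unwritten may be mapped, since older buffers have already been reused. The reservation state (see journal.h) tracks four buffer refcounts, so JOURNAL_BUF_MASK is presumably 3 -- an assumption used in this illustration:

/*
 * Illustration only (assumes JOURNAL_BUF_NR == 4, JOURNAL_BUF_MASK == 3,
 * matching the four bufN_count fields in union journal_res_state):
 *
 *   seq 100 -> j->buf[100 & 3] == j->buf[0]
 *   seq 101 -> j->buf[1]
 *   seq 102 -> j->buf[2]
 *   seq 103 -> j->buf[3]
 *   seq 104 -> j->buf[0] again -- which is why journal_seq_unwritten()
 *              is checked before dereferencing the buffer.
 */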
+
+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(p->list); i++)
+ INIT_LIST_HEAD(&p->list[i]);
+ INIT_LIST_HEAD(&p->flushed);
+ atomic_set(&p->count, count);
+ p->devs.nr = 0;
+}
+
+/*
+ * Detect stuck journal conditions and trigger shutdown. Technically the journal
+ * can end up stuck for a variety of reasons, such as a blocked I/O, journal
+ * reservation lockup, etc. Since this is a fatal error with potentially
+ * unpredictable characteristics, we want to be fairly conservative before we
+ * decide to shut things down.
+ *
+ * Consider the journal stuck when it appears full with no ability to commit
+ * btree transactions, discard journal buckets, or acquire a priority
+ * (reserved watermark) reservation.
+ */
+static inline bool
+journal_error_check_stuck(struct journal *j, int error, unsigned flags)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool stuck = false;
+ struct printbuf buf = PRINTBUF;
+
+ if (!(error == JOURNAL_ERR_journal_full ||
+ error == JOURNAL_ERR_journal_pin_full) ||
+ nr_unwritten_journal_entries(j) ||
+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
+ return stuck;
+
+ spin_lock(&j->lock);
+
+ if (j->can_discard) {
+ spin_unlock(&j->lock);
+ return stuck;
+ }
+
+ stuck = true;
+
+ /*
+ * The journal shutdown path will set ->err_seq, but do it here first to
+ * serialize against concurrent failures and avoid duplicate error
+ * reports.
+ */
+ if (j->err_seq) {
+ spin_unlock(&j->lock);
+ return stuck;
+ }
+ j->err_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
+
+ bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
+ bch2_journal_errors[error]);
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "%s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_journal_pins_to_text(&buf, j);
+ bch_err(c, "Journal pins:\n%s", buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ dump_stack();
+
+ return stuck;
+}
+
+/*
+ * Final processing when the last reference of a journal buffer has been
+ * dropped. Drop the pin list reference acquired at journal entry open and write
+ * the buffer, if requested.
+ */
+void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ lockdep_assert_held(&j->lock);
+
+ if (__bch2_journal_pin_put(j, seq))
+ bch2_journal_reclaim_fast(j);
+ if (write)
+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+}
+
+/*
+ * Close the currently open journal entry, if there is one:
+ *
+ * We don't close a journal_buf until the next journal_buf is finished writing,
+ * and can be opened again - this also initializes the next journal_buf:
+ */
+static void __journal_entry_close(struct journal *j, unsigned closed_val)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf = journal_cur_buf(j);
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+ unsigned sectors;
+
+ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
+ closed_val != JOURNAL_ENTRY_ERROR_VAL);
+
+ lockdep_assert_held(&j->lock);
+
+ do {
+ old.v = new.v = v;
+ new.cur_entry_offset = closed_val;
+
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
+ old.cur_entry_offset == new.cur_entry_offset)
+ return;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ if (!__journal_entry_is_open(old))
+ return;
+
+ /* Close out old buffer: */
+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+
+ sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+ buf->u64s_reserved) << c->block_bits;
+ BUG_ON(sectors > buf->sectors);
+ buf->sectors = sectors;
+
+ /*
+ * We have to set last_seq here, _before_ opening a new journal entry:
+ *
+	 * A thread may replace an old pin with a new pin on its current
+ * journal reservation - the expectation being that the journal will
+ * contain either what the old pin protected or what the new pin
+ * protects.
+ *
+ * After the old pin is dropped journal_last_seq() won't include the old
+ * pin, so we can only write the updated last_seq on the entry that
+ * contains whatever the new pin protects.
+ *
+ * Restated, we can _not_ update last_seq for a given entry if there
+ * could be a newer entry open with reservations/pins that have been
+ * taken against it.
+ *
+	 * Hence, we want to update/set last_seq on the current journal entry right
+ * before we open a new one:
+ */
+ buf->last_seq = journal_last_seq(j);
+ buf->data->last_seq = cpu_to_le64(buf->last_seq);
+ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
+
+ cancel_delayed_work(&j->write_work);
+
+ bch2_journal_space_available(j);
+
+ __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+ spin_lock(&j->lock);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
+ journal_wake(j);
+ spin_unlock(&j->lock);
+}
+
+static bool journal_entry_want_write(struct journal *j)
+{
+ bool ret = !journal_entry_is_open(j) ||
+ journal_cur_seq(j) == journal_last_unwritten_seq(j);
+
+ /* Don't close it yet if we already have a write in flight: */
+ if (ret)
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ else if (nr_unwritten_journal_entries(j)) {
+ struct journal_buf *buf = journal_cur_buf(j);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+ }
+
+ return ret;
+}
+
+bool bch2_journal_entry_close(struct journal *j)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = journal_entry_want_write(j);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/*
+ * should _only_ be called from journal_res_get() - when we actually want a
+ * journal reservation - an open journal entry means the journal is dirty:
+ */
+static int journal_entry_open(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf = j->buf +
+ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
+ union journal_res_state old, new;
+ int u64s;
+ u64 v;
+
+ lockdep_assert_held(&j->lock);
+ BUG_ON(journal_entry_is_open(j));
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+ if (j->blocked)
+ return JOURNAL_ERR_blocked;
+
+ if (j->cur_entry_error)
+ return j->cur_entry_error;
+
+ if (bch2_journal_error(j))
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+
+ if (!fifo_free(&j->pin))
+ return JOURNAL_ERR_journal_pin_full;
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
+ return JOURNAL_ERR_max_in_flight;
+
+ BUG_ON(!j->cur_entry_sectors);
+
+ buf->expires =
+ (journal_cur_seq(j) == j->flushed_seq_ondisk
+ ? jiffies
+ : j->last_flush_write) +
+ msecs_to_jiffies(c->opts.journal_flush_delay);
+
+ buf->u64s_reserved = j->entry_u64s_reserved;
+ buf->disk_sectors = j->cur_entry_sectors;
+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
+
+ u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+ journal_entry_overhead(j);
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+
+ if (u64s <= (ssize_t) j->early_journal_entries.nr)
+ return JOURNAL_ERR_journal_full;
+
+ if (fifo_empty(&j->pin) && j->reclaim_thread)
+ wake_up_process(j->reclaim_thread);
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for journal_last_seq() to be calculated correctly
+ */
+ atomic64_inc(&j->seq);
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+
+ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+
+ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
+
+ bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
+
+ memset(buf->data, 0, sizeof(*buf->data));
+ buf->data->seq = cpu_to_le64(journal_cur_seq(j));
+ buf->data->u64s = 0;
+
+ if (j->early_journal_entries.nr) {
+ memcpy(buf->data->_data, j->early_journal_entries.data,
+ j->early_journal_entries.nr * sizeof(u64));
+ le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
+ }
+
+ /*
+ * Must be set before marking the journal entry as open:
+ */
+ j->cur_entry_u64s = u64s;
+
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+
+ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
+
+ new.idx++;
+ BUG_ON(journal_state_count(new, new.idx));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
+
+ journal_state_inc(&new);
+
+ /* Handle any already added entries */
+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ if (j->res_get_blocked_start)
+ bch2_time_stats_update(j->blocked_time,
+ j->res_get_blocked_start);
+ j->res_get_blocked_start = 0;
+
+ mod_delayed_work(c->io_complete_wq,
+ &j->write_work,
+ msecs_to_jiffies(c->opts.journal_flush_delay));
+ journal_wake(j);
+
+ if (j->early_journal_entries.nr)
+ darray_exit(&j->early_journal_entries);
+ return 0;
+}
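
journal_entry_open() sizes the new entry from the space the allocator reported: buf->sectors worth of bytes converted to u64s, minus the jset header and any standing per-entry reservations. A rough worked example (editor's illustration; the overhead value is an assumption):

/*
 * Illustration only: suppose buf->sectors = 8 (a 4 KiB entry) and
 * journal_entry_overhead(j) works out to 7 u64s (sizeof(struct jset)/8
 * plus j->entry_u64s_reserved -- assumed values):
 *
 *   (8 << 9) / sizeof(u64) = 4096 / 8 = 512 u64s in the buffer
 *   512 - 7                = 505 u64s available for reservations,
 *   clamped to at most JOURNAL_ENTRY_CLOSED_VAL - 1.
 */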
+
+static bool journal_quiesced(struct journal *j)
+{
+ bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
+
+ if (!ret)
+ bch2_journal_entry_close(j);
+ return ret;
+}
+
+static void journal_quiesce(struct journal *j)
+{
+ wait_event(j->wait, journal_quiesced(j));
+}
+
+static void journal_write_work(struct work_struct *work)
+{
+ struct journal *j = container_of(work, struct journal, write_work.work);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ long delta;
+
+ spin_lock(&j->lock);
+ if (!__journal_entry_is_open(j->reservations))
+ goto unlock;
+
+ delta = journal_cur_buf(j)->expires - jiffies;
+
+ if (delta > 0)
+ mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+unlock:
+ spin_unlock(&j->lock);
+}
+
+static int __journal_res_get(struct journal *j, struct journal_res *res,
+ unsigned flags)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf;
+ bool can_discard;
+ int ret;
+retry:
+ if (journal_res_get_fast(j, res, flags))
+ return 0;
+
+ if (bch2_journal_error(j))
+ return -BCH_ERR_erofs_journal_err;
+
+ spin_lock(&j->lock);
+
+ /* check once more in case somebody else shut things down... */
+ if (bch2_journal_error(j)) {
+ spin_unlock(&j->lock);
+ return -BCH_ERR_erofs_journal_err;
+ }
+
+ /*
+ * Recheck after taking the lock, so we don't race with another thread
+	 * that just did journal_entry_open() and end up calling
+	 * bch2_journal_entry_close() unnecessarily
+ */
+ if (journal_res_get_fast(j, res, flags)) {
+ spin_unlock(&j->lock);
+ return 0;
+ }
+
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+ /*
+ * Don't want to close current journal entry, just need to
+ * invoke reclaim:
+ */
+ ret = JOURNAL_ERR_journal_full;
+ goto unlock;
+ }
+
+ /*
+ * If we couldn't get a reservation because the current buf filled up,
+ * and we had room for a bigger entry on disk, signal that we want to
+ * realloc the journal bufs:
+ */
+ buf = journal_cur_buf(j);
+ if (journal_entry_is_open(j) &&
+ buf->buf_size >> 9 < buf->disk_sectors &&
+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
+
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ ret = journal_entry_open(j);
+
+ if (ret == JOURNAL_ERR_max_in_flight)
+ trace_and_count(c, journal_entry_full, c);
+unlock:
+ if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
+ !j->res_get_blocked_start) {
+ j->res_get_blocked_start = local_clock() ?: 1;
+ trace_and_count(c, journal_full, c);
+ }
+
+ can_discard = j->can_discard;
+ spin_unlock(&j->lock);
+
+ if (!ret)
+ goto retry;
+ if (journal_error_check_stuck(j, ret, flags))
+ ret = -BCH_ERR_journal_res_get_blocked;
+
+ /*
+ * Journal is full - can't rely on reclaim from work item due to
+ * freezing:
+ */
+ if ((ret == JOURNAL_ERR_journal_full ||
+ ret == JOURNAL_ERR_journal_pin_full) &&
+ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+ if (can_discard) {
+ bch2_journal_do_discards(j);
+ goto retry;
+ }
+
+ if (mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
+ }
+
+ return ret == JOURNAL_ERR_insufficient_devices
+ ? -BCH_ERR_erofs_journal_err
+ : -BCH_ERR_journal_res_get_blocked;
+}
+
+/*
+ * Essentially the entry function to the journaling code. When bcachefs is doing
+ * a btree insert, it calls this function to get the current journal write.
+ * A journal write is the structure used to set up journal writes. The calling
+ * function will then add its keys to the structure, queuing them for the next
+ * write.
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks.
+ */
+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
+ unsigned flags)
+{
+ int ret;
+
+ closure_wait_event(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK));
+ return ret;
+}
+
+/* journal_entry_res: */
+
+void bch2_journal_entry_res_resize(struct journal *j,
+ struct journal_entry_res *res,
+ unsigned new_u64s)
+{
+ union journal_res_state state;
+ int d = new_u64s - res->u64s;
+
+ spin_lock(&j->lock);
+
+ j->entry_u64s_reserved += d;
+ if (d <= 0)
+ goto out;
+
+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
+ smp_mb();
+ state = READ_ONCE(j->reservations);
+
+ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
+ state.cur_entry_offset > j->cur_entry_u64s) {
+ j->cur_entry_u64s += d;
+ /*
+ * Not enough room in current journal entry, have to flush it:
+ */
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ } else {
+ journal_cur_buf(j)->u64s_reserved += d;
+ }
+out:
+ spin_unlock(&j->lock);
+ res->u64s += d;
+}
+
+/* journal flushing: */
+
+/**
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ * @j: journal object
+ * @seq: seq to flush
+ * @parent: closure object to wait with
+ * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
+ * -EIO if @seq will never be flushed
+ *
+ * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * necessary
+ */
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+ struct closure *parent)
+{
+ struct journal_buf *buf;
+ int ret = 0;
+
+ if (seq <= j->flushed_seq_ondisk)
+ return 1;
+
+ spin_lock(&j->lock);
+
+ if (WARN_ONCE(seq > journal_cur_seq(j),
+ "requested to flush journal seq %llu, but currently at %llu",
+ seq, journal_cur_seq(j)))
+ goto out;
+
+ /* Recheck under lock: */
+ if (j->err_seq && seq >= j->err_seq) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (seq <= j->flushed_seq_ondisk) {
+ ret = 1;
+ goto out;
+ }
+
+ /* if seq was written, but not flushed - flush a newer one instead */
+ seq = max(seq, journal_last_unwritten_seq(j));
+
+recheck_need_open:
+ if (seq > journal_cur_seq(j)) {
+ struct journal_res res = { 0 };
+
+ if (journal_entry_is_open(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
+ spin_unlock(&j->lock);
+
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
+
+ seq = res.seq;
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+
+ if (parent && !closure_wait(&buf->wait, parent))
+ BUG();
+
+ bch2_journal_res_put(j, &res);
+
+ spin_lock(&j->lock);
+ goto want_write;
+ }
+
+ /*
+ * if write was kicked off without a flush, flush the next sequence
+ * number instead
+ */
+ buf = journal_seq_to_buf(j, seq);
+ if (buf->noflush) {
+ seq++;
+ goto recheck_need_open;
+ }
+
+ buf->must_flush = true;
+
+ if (parent && !closure_wait(&buf->wait, parent))
+ BUG();
+want_write:
+ if (seq == journal_cur_seq(j))
+ journal_entry_want_write(j);
+out:
+ spin_unlock(&j->lock);
+ return ret;
+}
+
+int bch2_journal_flush_seq(struct journal *j, u64 seq)
+{
+ u64 start_time = local_clock();
+ int ret, ret2;
+
+ /*
+ * Don't update time_stats when @seq is already flushed:
+ */
+ if (seq <= j->flushed_seq_ondisk)
+ return 0;
+
+ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
+
+ if (!ret)
+ bch2_time_stats_update(j->flush_seq_time, start_time);
+
+ return ret ?: ret2 < 0 ? ret2 : 0;
+}
+
+/*
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
+ * entry still being written, write it and wait for the write to complete
+ */
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+{
+ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
+}
+
+int bch2_journal_flush(struct journal *j)
+{
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
+}
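
These flush entry points compose with the closure library: a caller that needs a particular sequence number to be durable can pass a parent closure to bch2_journal_flush_seq_async() and wait on it. A hedged sketch (editor's illustration, not part of the patch; closure_init_stack() and closure_sync() are from the closure library, not this file):

/* Hypothetical caller -- illustration only. */
static int example_wait_for_seq(struct journal *j, u64 seq)
{
	struct closure cl;
	int ret;

	closure_init_stack(&cl);

	ret = bch2_journal_flush_seq_async(j, seq, &cl);
	closure_sync(&cl);	/* returns immediately if nothing took a ref on cl */

	/* 1 means already flushed, 0 means a flush was issued and has completed */
	return ret < 0 ? ret : 0;
}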
+
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ u64 unwritten_seq;
+ bool ret = false;
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+ return false;
+
+ if (seq <= c->journal.flushed_seq_ondisk)
+ return false;
+
+ spin_lock(&j->lock);
+ if (seq <= c->journal.flushed_seq_ondisk)
+ goto out;
+
+ for (unwritten_seq = journal_last_unwritten_seq(j);
+ unwritten_seq < seq;
+ unwritten_seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+ /* journal write is already in flight, and was a flush write: */
+ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+ goto out;
+
+ buf->noflush = true;
+ }
+
+ ret = true;
+out:
+ spin_unlock(&j->lock);
+ return ret;
+}
+
+int bch2_journal_meta(struct journal *j)
+{
+ struct journal_buf *buf;
+ struct journal_res res;
+ int ret;
+
+ memset(&res, 0, sizeof(res));
+
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
+
+ buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+
+ bch2_journal_res_put(j, &res);
+
+ return bch2_journal_flush_seq(j, res.seq);
+}
+
+/* block/unlock the journal: */
+
+void bch2_journal_unblock(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked--;
+ spin_unlock(&j->lock);
+
+ journal_wake(j);
+}
+
+void bch2_journal_block(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked++;
+ spin_unlock(&j->lock);
+
+ journal_quiesce(j);
+}
+
+/* allocate journal on a device: */
+
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
+ bool new_fs, struct closure *cl)
+{
+ struct bch_fs *c = ca->fs;
+ struct journal_device *ja = &ca->journal;
+ u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ struct open_bucket **ob = NULL;
+ long *bu = NULL;
+ unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
+ int ret = 0;
+
+ BUG_ON(nr <= ja->nr);
+
+ bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
+ ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
+ new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ if (!bu || !ob || !new_buckets || !new_bucket_seq) {
+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ goto err_free;
+ }
+
+ for (nr_got = 0; nr_got < nr_want; nr_got++) {
+ if (new_fs) {
+ bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
+ if (bu[nr_got] < 0) {
+ ret = -BCH_ERR_ENOSPC_bucket_alloc;
+ break;
+ }
+ } else {
+ ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
+ ret = PTR_ERR_OR_ZERO(ob[nr_got]);
+ if (ret)
+ break;
+
+ ret = bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(trans, ca,
+ ob[nr_got]->bucket, BCH_DATA_journal,
+ ca->mi.bucket_size));
+ if (ret) {
+ bch2_open_bucket_put(c, ob[nr_got]);
+ bch_err_msg(c, ret, "marking new journal buckets");
+ break;
+ }
+
+ bu[nr_got] = ob[nr_got]->bucket;
+ }
+ }
+
+ if (!nr_got)
+ goto err_free;
+
+ /* Don't return an error if we successfully allocated some buckets: */
+ ret = 0;
+
+ if (c) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_block(&c->journal);
+ mutex_lock(&c->sb_lock);
+ }
+
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
+
+ BUG_ON(ja->discard_idx > ja->nr);
+
+ pos = ja->discard_idx ?: ja->nr;
+
+ memmove(new_buckets + pos + nr_got,
+ new_buckets + pos,
+ sizeof(new_buckets[0]) * (ja->nr - pos));
+ memmove(new_bucket_seq + pos + nr_got,
+ new_bucket_seq + pos,
+ sizeof(new_bucket_seq[0]) * (ja->nr - pos));
+
+ for (i = 0; i < nr_got; i++) {
+ new_buckets[pos + i] = bu[i];
+ new_bucket_seq[pos + i] = 0;
+ }
+
+ nr = ja->nr + nr_got;
+
+ ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+ if (ret)
+ goto err_unblock;
+
+ if (!new_fs)
+ bch2_write_super(c);
+
+ /* Commit: */
+ if (c)
+ spin_lock(&c->journal.lock);
+
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+ ja->nr = nr;
+
+ if (pos <= ja->discard_idx)
+ ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx_ondisk)
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx)
+ ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+ if (pos <= ja->cur_idx)
+ ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
+
+ if (c)
+ spin_unlock(&c->journal.lock);
+err_unblock:
+ if (c) {
+ bch2_journal_unblock(&c->journal);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (ret && !new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(trans, ca,
+ bu[i], BCH_DATA_free, 0));
+err_free:
+ if (!new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_open_bucket_put(c, ob[i]);
+
+ kfree(new_bucket_seq);
+ kfree(new_buckets);
+ kfree(ob);
+ kfree(bu);
+ return ret;
+}
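
New buckets are spliced into the copy of the ring at ja->discard_idx, and every ring index at or past the insertion point is advanced by the number of buckets added. A small worked example (editor's illustration):

/*
 * Illustration only: ja->nr = 4, ja->buckets = { A, B, C, D },
 * ja->discard_idx = 2, nr_got = 2 new buckets { X, Y }:
 *
 *   pos = 2, the tail { C, D } is shifted up by memmove(), giving
 *   ja->buckets = { A, B, X, Y, C, D } and ja->nr = 6;
 *   discard_idx then becomes (2 + 2) % 6 = 4, still pointing at C,
 *   and dirty_idx/cur_idx are adjusted the same way when >= pos.
 */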
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr)
+{
+ struct journal_device *ja = &ca->journal;
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ down_write(&c->state_lock);
+
+ /* don't handle reducing nr of buckets yet: */
+ if (nr < ja->nr)
+ goto unlock;
+
+ while (ja->nr < nr) {
+ struct disk_reservation disk_res = { 0, 0, 0 };
+
+ /*
+ * note: journal buckets aren't really counted as _sectors_ used yet, so
+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+ * when space used goes up without a reservation - but we do need the
+ * reservation to ensure we'll actually be able to allocate:
+ *
+ * XXX: that's not right, disk reservations only ensure a
+ * filesystem-wide allocation will succeed, this is a device
+ * specific allocation - we can hang here:
+ */
+
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0);
+ if (ret)
+ break;
+
+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+
+ bch2_disk_reservation_put(c, &disk_res);
+
+ closure_sync(&cl);
+
+ if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
+ break;
+ }
+
+ if (ret)
+ bch_err_fn(c, ret);
+unlock:
+ up_write(&c->state_lock);
+ return ret;
+}
+
+int bch2_dev_journal_alloc(struct bch_dev *ca)
+{
+ unsigned nr;
+ int ret;
+
+ if (dynamic_fault("bcachefs:add:journal_alloc")) {
+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ goto err;
+ }
+
+ /* 1/128th of the device by default: */
+ nr = ca->mi.nbuckets >> 7;
+
+ /*
+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
+ * is smaller:
+ */
+ nr = clamp_t(unsigned, nr,
+ BCH_JOURNAL_BUCKETS_MIN,
+ min(1 << 13,
+ (1 << 24) / ca->mi.bucket_size));
+
+ ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+err:
+ if (ret)
+ bch_err_fn(ca, ret);
+ return ret;
+}
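
The sizing above works out to 1/128th of the device, clamped between BCH_JOURNAL_BUCKETS_MIN and the smaller of 8192 buckets and 8 GiB ((1 << 24) 512-byte sectors). A worked example with made-up device geometry (editor's illustration):

/*
 * Illustration only: a device with 1,048,576 buckets of 512 KiB
 * (ca->mi.bucket_size = 1024 sectors):
 *
 *   nr = 1048576 >> 7                          = 8192 buckets
 *   (1 << 24) / 1024                           = 16384 buckets (8 GiB cap)
 *   min(1 << 13, 16384)                        = 8192
 *   clamp(8192, BCH_JOURNAL_BUCKETS_MIN, 8192) = 8192 buckets, a 4 GiB journal
 */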
+
+int bch2_fs_journal_alloc(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ if (ca->journal.nr)
+ continue;
+
+ int ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/* startup/shutdown: */
+
+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
+{
+ bool ret = false;
+ u64 seq;
+
+ spin_lock(&j->lock);
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j) && !ret;
+ seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, seq);
+
+ if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
+ ret = true;
+ }
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
+{
+ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
+}
+
+void bch2_fs_journal_stop(struct journal *j)
+{
+ bch2_journal_reclaim_stop(j);
+ bch2_journal_flush_all_pins(j);
+
+ wait_event(j->wait, bch2_journal_entry_close(j));
+
+ /*
+ * Always write a new journal entry, to make sure the clock hands are up
+ * to date (and match the superblock)
+ */
+ bch2_journal_meta(j);
+
+ journal_quiesce(j);
+
+ BUG_ON(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
+ j->last_empty_seq != journal_cur_seq(j));
+
+ cancel_delayed_work_sync(&j->write_work);
+}
+
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct journal_replay *i, **_i;
+ struct genradix_iter iter;
+ bool had_entries = false;
+ unsigned ptr;
+ u64 last_seq = cur_seq, nr, seq;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
+ }
+
+ nr = cur_seq - last_seq;
+
+ if (nr + 1 > j->pin.size) {
+ free_fifo(&j->pin);
+ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
+ if (!j->pin.data) {
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+ return -BCH_ERR_ENOMEM_journal_pin_fifo;
+ }
+ }
+
+ j->replay_journal_seq = last_seq;
+ j->replay_journal_seq_end = cur_seq;
+ j->last_seq_ondisk = last_seq;
+ j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_ondisk = cur_seq - 1;
+ j->pin.front = last_seq;
+ j->pin.back = cur_seq;
+ atomic64_set(&j->seq, cur_seq - 1);
+
+ fifo_for_each_entry_ptr(p, &j->pin, seq)
+ journal_pin_list_init(p, 1);
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ seq = le64_to_cpu(i->j.seq);
+ BUG_ON(seq >= cur_seq);
+
+ if (seq < last_seq)
+ continue;
+
+ if (journal_entry_empty(&i->j))
+ j->last_empty_seq = le64_to_cpu(i->j.seq);
+
+ p = journal_seq_pin(j, seq);
+
+ p->devs.nr = 0;
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+ had_entries = true;
+ }
+
+ if (!had_entries)
+ j->last_empty_seq = cur_seq;
+
+ spin_lock(&j->lock);
+
+ set_bit(JOURNAL_STARTED, &j->flags);
+ j->last_flush_write = jiffies;
+
+ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
+ j->reservations.unwritten_idx++;
+
+ c->last_bucket_seq_cleanup = journal_cur_seq(j);
+
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+
+ return bch2_journal_reclaim_start(j);
+}
+
+/* init/exit: */
+
+void bch2_dev_journal_exit(struct bch_dev *ca)
+{
+ kfree(ca->journal.bio);
+ kfree(ca->journal.buckets);
+ kfree(ca->journal.bucket_seq);
+
+ ca->journal.bio = NULL;
+ ca->journal.buckets = NULL;
+ ca->journal.bucket_seq = NULL;
+}
+
+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets =
+ bch2_sb_field_get(sb, journal);
+ struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+ bch2_sb_field_get(sb, journal_v2);
+ unsigned i, nr_bvecs;
+
+ ja->nr = 0;
+
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+ for (i = 0; i < nr; i++)
+ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+ } else if (journal_buckets) {
+ ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ }
+
+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!ja->bucket_seq)
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+
+ ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ if (!ca->journal.bio)
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+
+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!ja->buckets)
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+ unsigned j, dst = 0;
+
+ for (i = 0; i < nr; i++)
+ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ ja->buckets[dst++] =
+ le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+ } else if (journal_buckets) {
+ for (i = 0; i < ja->nr; i++)
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ }
+
+ return 0;
+}
+
+void bch2_fs_journal_exit(struct journal *j)
+{
+ unsigned i;
+
+ darray_exit(&j->early_journal_entries);
+
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvpfree(j->buf[i].data, j->buf[i].buf_size);
+ free_fifo(&j->pin);
+}
+
+int bch2_fs_journal_init(struct journal *j)
+{
+ static struct lock_class_key res_key;
+ unsigned i;
+
+ spin_lock_init(&j->lock);
+ spin_lock_init(&j->err_lock);
+ init_waitqueue_head(&j->wait);
+ INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ init_waitqueue_head(&j->reclaim_wait);
+ init_waitqueue_head(&j->pin_flush_wait);
+ mutex_init(&j->reclaim_lock);
+ mutex_init(&j->discard_lock);
+
+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+ atomic64_set(&j->reservations.counter,
+ ((union journal_res_state)
+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
+ return -BCH_ERR_ENOMEM_journal_pin_fifo;
+
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ if (!j->buf[i].data)
+ return -BCH_ERR_ENOMEM_journal_buf;
+ }
+
+ j->pin.front = j->pin.back = 1;
+ return 0;
+}
+
+/* debug: */
+
+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ union journal_res_state s;
+ struct bch_dev *ca;
+ unsigned long now = jiffies;
+ u64 seq;
+ unsigned i;
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+ out->atomic++;
+
+ rcu_read_lock();
+ s = READ_ONCE(j->reservations);
+
+ prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
+ prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
+ prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
+ prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry:\t\t");
+
+ switch (s.cur_entry_offset) {
+ case JOURNAL_ENTRY_ERROR_VAL:
+ prt_printf(out, "error");
+ break;
+ case JOURNAL_ENTRY_CLOSED_VAL:
+ prt_printf(out, "closed");
+ break;
+ default:
+ prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
+ break;
+ }
+
+ prt_newline(out);
+
+ for (seq = journal_cur_seq(j);
+ seq >= journal_last_unwritten_seq(j);
+ --seq) {
+ i = seq & JOURNAL_BUF_MASK;
+
+ prt_printf(out, "unwritten entry:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
+
+ prt_printf(out, "sectors:");
+ prt_tab(out);
+ prt_printf(out, "%u", j->buf[i].sectors);
+ prt_newline(out);
+
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+ }
+
+ prt_printf(out,
+ "replay done:\t\t%i\n",
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+
+ prt_printf(out, "space:\n");
+ prt_printf(out, "\tdiscarded\t%u:%u\n",
+ j->space[journal_space_discarded].next_entry,
+ j->space[journal_space_discarded].total);
+ prt_printf(out, "\tclean ondisk\t%u:%u\n",
+ j->space[journal_space_clean_ondisk].next_entry,
+ j->space[journal_space_clean_ondisk].total);
+ prt_printf(out, "\tclean\t\t%u:%u\n",
+ j->space[journal_space_clean].next_entry,
+ j->space[journal_space_clean].total);
+ prt_printf(out, "\ttotal\t\t%u:%u\n",
+ j->space[journal_space_total].next_entry,
+ j->space[journal_space_total].total);
+
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_journal]) {
+ struct journal_device *ja = &ca->journal;
+
+ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
+ continue;
+
+ if (!ja->nr)
+ continue;
+
+ prt_printf(out, "dev %u:\n", i);
+ prt_printf(out, "\tnr\t\t%u\n", ja->nr);
+ prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
+ prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
+ prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ }
+
+ rcu_read_unlock();
+
+ --out->atomic;
+}
+
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+ spin_lock(&j->lock);
+ __bch2_journal_debug_to_text(out, j);
+ spin_unlock(&j->lock);
+}
+
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *pin;
+ unsigned i;
+
+ spin_lock(&j->lock);
+ *seq = max(*seq, j->pin.front);
+
+ if (*seq >= j->pin.back) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ out->atomic++;
+
+ pin_list = journal_seq_pin(j, *seq);
+
+ prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+ list_for_each_entry(pin, &pin_list->list[i], list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
+ }
+
+ if (!list_empty(&pin_list->flushed)) {
+ prt_printf(out, "flushed:");
+ prt_newline(out);
+ }
+
+ list_for_each_entry(pin, &pin_list->flushed, list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
+ }
+
+ printbuf_indent_sub(out, 2);
+
+ --out->atomic;
+ spin_unlock(&j->lock);
+
+ return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+ u64 seq = 0;
+
+ while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+ seq++;
+}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
new file mode 100644
index 000000000000..2f768e11aec9
--- /dev/null
+++ b/fs/bcachefs/journal.h
@@ -0,0 +1,450 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_H
+#define _BCACHEFS_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The primary purpose of the journal is to log updates (insertions) to the
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
+ *
+ * Without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
+ * down to the journalling code. That closure will wait on the journal write to
+ * complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after 10 ms have elapsed, by default (the delay_ms field
+ * in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset), it's got a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ *
+ * On disk, this is represented with the "last_seq" field of struct jset;
+ * last_seq is the first sequence number that journal replay has to replay.
+ *
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
+ * journal_device->seq) from each journal bucket to the highest sequence number
+ * of any journal entry it contains. Then, by comparing that against last_seq we
+ * can determine whether that journal bucket contains dirty journal entries or
+ * not.
+ *
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
+ * (where each entry corresponds to a specific sequence number) - when a ref
+ * goes to 0, that journal entry is no longer dirty.
+ *
+ * Journalling of index updates is done at the same time as the b-tree itself is
+ * being modified (see btree_insert_key()); when we add the key to the journal
+ * the pending b-tree write takes a ref on the journal entry the key was added
+ * to. If a pending b-tree write would need to take refs on multiple dirty
+ * journal entries, it only keeps the ref on the oldest one (since a newer
+ * journal entry will still be replayed if an older entry was dirty).
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle it the
+ * same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
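
The dirty-entry tracking described above comes down to a fifo of refcounts indexed by sequence number, whose front is journal_last_seq(). A deliberately simplified, self-contained model of that idea (editor's sketch; the real code uses j->pin, a fifo of journal_entry_pin_list, with lock-free refcounting):

/* Simplified userspace model of the refcount fifo -- illustration only. */
#include <assert.h>

#define MODEL_NR	16

struct model_journal {
	unsigned long long	front;			/* == journal_last_seq() */
	unsigned long long	back;			/* one past the newest entry */
	int			count[MODEL_NR];	/* refcount per dirty entry */
};

/* A pending btree write takes a ref on the entry its keys were added to. */
static void model_pin(struct model_journal *j, unsigned long long seq)
{
	assert(seq >= j->front && seq < j->back);
	j->count[seq % MODEL_NR]++;
}

/* Dropping the last ref lets last_seq (the fifo front) advance. */
static void model_unpin(struct model_journal *j, unsigned long long seq)
{
	j->count[seq % MODEL_NR]--;
	assert(j->count[seq % MODEL_NR] >= 0);

	while (j->front < j->back && !j->count[j->front % MODEL_NR])
		j->front++;	/* analogous to journal reclaim advancing last_seq */
}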
+
+#include <linux/hash.h>
+
+#include "journal_types.h"
+
+struct bch_fs;
+
+static inline void journal_wake(struct journal *j)
+{
+ wake_up(&j->wait);
+ closure_wake_up(&j->async_wait);
+ closure_wake_up(&j->preres_wait);
+}
+
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+ return j->buf + j->reservations.idx;
+}
+
+/* Sequence number of oldest dirty journal entry */
+
+static inline u64 journal_last_seq(struct journal *j)
+{
+ return j->pin.front;
+}
+
+static inline u64 journal_cur_seq(struct journal *j)
+{
+ return atomic64_read(&j->seq);
+}
+
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
+ return j->seq_ondisk + 1;
+}
+
+static inline int journal_state_count(union journal_res_state s, int idx)
+{
+ switch (idx) {
+ case 0: return s.buf0_count;
+ case 1: return s.buf1_count;
+ case 2: return s.buf2_count;
+ case 3: return s.buf3_count;
+ }
+ BUG();
+}
+
+static inline void journal_state_inc(union journal_res_state *s)
+{
+ s->buf0_count += s->idx == 0;
+ s->buf1_count += s->idx == 1;
+ s->buf2_count += s->idx == 2;
+ s->buf3_count += s->idx == 3;
+}
+
+/*
+ * Amount of space that will be taken up by some keys in the journal (i.e.
+ * including the jset header)
+ */
+static inline unsigned jset_u64s(unsigned u64s)
+{
+ return u64s + sizeof(struct jset_entry) / sizeof(u64);
+}
+
+static inline int journal_entry_overhead(struct journal *j)
+{
+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
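
jset_u64s() accounts for the per-entry header on top of the caller's keys, and journal_entry_overhead() for the jset header plus standing reservations. A quick worked example (editor's illustration; the header size is an assumption):

/*
 * Illustration only: struct jset_entry is assumed to be 8 bytes (one u64)
 * of header, so reserving room for a key of 6 u64s costs
 *
 *   jset_u64s(6) = 6 + sizeof(struct jset_entry) / sizeof(u64) = 6 + 1 = 7 u64s
 *
 * and each empty "btree_keys" entry emitted by bch2_journal_res_put() below
 * to burn leftover space costs jset_u64s(0) = 1 u64.
 */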
+
+static inline struct jset_entry *
+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
+{
+ struct jset *jset = buf->data;
+ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
+
+ memset(entry, 0, sizeof(*entry));
+ entry->u64s = cpu_to_le16(u64s);
+
+ le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+
+ return entry;
+}
+
+static inline struct jset_entry *
+journal_res_entry(struct journal *j, struct journal_res *res)
+{
+ return vstruct_idx(j->buf[res->idx].data, res->offset);
+}
+
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
+ unsigned u64s)
+{
+ entry->u64s = cpu_to_le16(u64s);
+ entry->btree_id = id;
+ entry->level = level;
+ entry->type = type;
+ entry->pad[0] = 0;
+ entry->pad[1] = 0;
+ entry->pad[2] = 0;
+ return jset_u64s(u64s);
+}
+
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
+ const void *data, unsigned u64s)
+{
+ unsigned ret = journal_entry_init(entry, type, id, level, u64s);
+
+ memcpy_u64s_small(entry->_data, data, u64s);
+ return ret;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+ unsigned type, enum btree_id id,
+ unsigned level, unsigned u64s)
+{
+ struct jset_entry *entry = journal_res_entry(j, res);
+ unsigned actual = journal_entry_init(entry, type, id, level, u64s);
+
+ EBUG_ON(!res->ref);
+ EBUG_ON(actual > res->u64s);
+
+ res->offset += actual;
+ res->u64s -= actual;
+ return entry;
+}
+
+static inline bool journal_entry_empty(struct jset *j)
+{
+ struct jset_entry *i;
+
+ if (j->seq != j->last_seq)
+ return false;
+
+ vstruct_for_each(j, i)
+ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
+ return false;
+ return true;
+}
+
+/*
+ * Drop a reference on a buffer index and return the new reservation state, so
+ * the caller can check whether the count has hit zero.
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
+{
+ union journal_res_state s;
+
+ s.v = atomic64_sub_return(((union journal_res_state) {
+ .buf0_count = idx == 0,
+ .buf1_count = idx == 1,
+ .buf2_count = idx == 2,
+ .buf3_count = idx == 3,
+ }).v, &j->reservations.counter);
+ return s;
+}
+
+bool bch2_journal_entry_close(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64, bool);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+ union journal_res_state s;
+
+ s = journal_state_buf_put(j, idx);
+ if (!journal_state_count(s, idx))
+ bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+}
+
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+ union journal_res_state s;
+
+ s = journal_state_buf_put(j, idx);
+ if (!journal_state_count(s, idx)) {
+ spin_lock(&j->lock);
+ bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ spin_unlock(&j->lock);
+ }
+}
+
+/*
+ * This function releases the journal write structure so other threads can
+ * then proceed to add their keys as well.
+ */
+static inline void bch2_journal_res_put(struct journal *j,
+ struct journal_res *res)
+{
+ if (!res->ref)
+ return;
+
+ lock_release(&j->res_map, _THIS_IP_);
+
+ while (res->u64s)
+ bch2_journal_add_entry(j, res,
+ BCH_JSET_ENTRY_btree_keys,
+ 0, 0, 0);
+
+ bch2_journal_buf_put(j, res->idx, res->seq);
+
+ res->ref = 0;
+}
+
+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
+ unsigned);
+
+/* First bits for BCH_WATERMARK: */
+enum journal_res_flags {
+ __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS,
+ __JOURNAL_RES_GET_CHECK,
+};
+
+#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK)
+#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK)
+
+static inline int journal_res_get_fast(struct journal *j,
+ struct journal_res *res,
+ unsigned flags)
+{
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+
+ /*
+ * Check if there is still room in the current journal
+ * entry:
+ */
+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
+ return 0;
+
+ EBUG_ON(!journal_state_count(new, new.idx));
+
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark)
+ return 0;
+
+ new.cur_entry_offset += res->u64s;
+ journal_state_inc(&new);
+
+ /*
+ * If the refcount would overflow, we have to wait:
+ * XXX - tracepoint this:
+ */
+ if (!journal_state_count(new, new.idx))
+ return 0;
+
+ if (flags & JOURNAL_RES_GET_CHECK)
+ return 1;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ res->ref = true;
+ res->idx = old.idx;
+ res->offset = old.cur_entry_offset;
+ res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
+ return 1;
+}
+
+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
+ unsigned u64s, unsigned flags)
+{
+ int ret;
+
+ EBUG_ON(res->ref);
+ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+
+ res->u64s = u64s;
+
+ if (journal_res_get_fast(j, res, flags))
+ goto out;
+
+ ret = bch2_journal_res_get_slowpath(j, res, flags);
+ if (ret)
+ return ret;
+out:
+ if (!(flags & JOURNAL_RES_GET_CHECK)) {
+ lock_acquire_shared(&j->res_map, 0,
+ (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
+ NULL, _THIS_IP_);
+ EBUG_ON(!res->ref);
+ }
+ return 0;
+}
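
Putting the reservation helpers together, journalling a key looks roughly like: size the request with jset_u64s(), get the reservation, initialize an entry and copy the key in, then put the reservation so the entry becomes part of the next write. An editor's sketch using only helpers declared in this header (error handling and NONBLOCK/watermark flags omitted; real callers go through the btree transaction commit path):

/* Hypothetical caller -- illustration only, not part of the patch. */
static int example_journal_key(struct journal *j, enum btree_id btree,
			       unsigned level, const struct bkey_i *k)
{
	struct journal_res res = { 0 };
	struct jset_entry *entry;
	int ret;

	ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s), 0);
	if (ret)
		return ret;

	entry = bch2_journal_add_entry(j, &res, BCH_JSET_ENTRY_btree_keys,
				       btree, level, k->k.u64s);
	memcpy_u64s_small(entry->_data, k, k->k.u64s);

	bch2_journal_res_put(j, &res);	/* entry now queued for the next journal write */
	return 0;
}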
+
+/* journal_entry_res: */
+
+void bch2_journal_entry_res_resize(struct journal *,
+ struct journal_entry_res *,
+ unsigned);
+
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+void bch2_journal_flush_async(struct journal *, struct closure *);
+
+int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
+int bch2_journal_meta(struct journal *);
+
+void bch2_journal_halt(struct journal *);
+
+static inline int bch2_journal_error(struct journal *j)
+{
+ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
+ ? -EIO : 0;
+}
+
+struct bch_dev;
+
+static inline void bch2_journal_set_replay_done(struct journal *j)
+{
+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+ set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+}
+
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
+
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
+
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
+ unsigned nr);
+int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_fs_journal_alloc(struct bch_fs *);
+
+void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
+
+void bch2_fs_journal_stop(struct journal *);
+int bch2_fs_journal_start(struct journal *, u64);
+
+void bch2_dev_journal_exit(struct bch_dev *);
+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
+void bch2_fs_journal_exit(struct journal *);
+int bch2_fs_journal_init(struct journal *);
+
+#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
new file mode 100644
index 000000000000..3eb6c3f62a81
--- /dev/null
+++ b/fs/bcachefs/journal_io.c
@@ -0,0 +1,1966 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_io.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "trace.h"
+
+static struct nonce journal_nonce(const struct jset *jset)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = ((__le32 *) &jset->seq)[0],
+ [2] = ((__le32 *) &jset->seq)[1],
+ [3] = BCH_NONCE_JOURNAL,
+ }};
+}
+
+static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+{
+ return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
+ !bch2_crc_cmp(j->csum,
+ csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+}
+
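+/*
+ * Journal entries are indexed in the genradix by their sequence number
+ * relative to journal_entries_base_seq, masked down to 31 bits so the
+ * index fits in an unsigned long (see journal_entry_add() below):
+ */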
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
+{
+ return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+ struct journal_replay *i)
+{
+ struct journal_replay **p =
+ genradix_ptr(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
+
+ BUG_ON(*p != i);
+ *p = NULL;
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+ i->ignore = true;
+
+ if (!c->opts.read_entire_journal)
+ __journal_replay_free(c, i);
+}
+
+struct journal_list {
+ struct closure cl;
+ u64 last_seq;
+ struct mutex lock;
+ int ret;
+};
+
+#define JOURNAL_ENTRY_ADD_OK 0
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
+
+/*
+ * Given a journal entry we just read, add it to the list of journal entries to
+ * be replayed:
+ */
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct journal_ptr entry_ptr,
+ struct journal_list *jlist, struct jset *j)
+{
+ struct genradix_iter iter;
+ struct journal_replay **_i, *i, *dup;
+ struct journal_ptr *ptr;
+ size_t bytes = vstruct_bytes(j);
+ u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+ int ret = JOURNAL_ENTRY_ADD_OK;
+
+ /* Is this entry older than the range we need? */
+ if (!c->opts.read_entire_journal &&
+ le64_to_cpu(j->seq) < jlist->last_seq)
+ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
+ /*
+ * genradixes are indexed by a ulong, not a u64, so we can't index them
+ * by sequence number directly: Assume instead that they will all fall
+ * within the range of +-2 billion of the first one we find.
+ */
+ if (!c->journal_entries_base_seq)
+ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
+
+ /* Drop entries we don't need anymore */
+ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+ genradix_for_each_from(&c->journal_entries, iter, _i,
+ journal_entry_radix_idx(c, jlist->last_seq)) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ if (le64_to_cpu(i->j.seq) >= last_seq)
+ break;
+ journal_replay_free(c, i);
+ }
+ }
+
+ jlist->last_seq = max(jlist->last_seq, last_seq);
+
+ _i = genradix_ptr_alloc(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+ GFP_KERNEL);
+ if (!_i)
+ return -BCH_ERR_ENOMEM_journal_entry_add;
+
+ /*
+ * Duplicate journal entries? If so we want the one that didn't have a
+ * checksum error:
+ */
+ dup = *_i;
+ if (dup) {
+ if (bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes)) {
+ i = dup;
+ goto found;
+ }
+
+ if (!entry_ptr.csum_good) {
+ i = dup;
+ goto found;
+ }
+
+ if (!dup->csum_good)
+ goto replace;
+
+ fsck_err(c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non identical journal entries (seq %llu)",
+ le64_to_cpu(j->seq));
+ i = dup;
+ goto found;
+ }
+replace:
+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ if (!i)
+ return -BCH_ERR_ENOMEM_journal_entry_add;
+
+ i->nr_ptrs = 0;
+ i->csum_good = entry_ptr.csum_good;
+ i->ignore = false;
+ unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
+ i->ptrs[i->nr_ptrs++] = entry_ptr;
+
+ if (dup) {
+ if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
+ bch_err(c, "found too many copies of journal entry %llu",
+ le64_to_cpu(i->j.seq));
+ dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
+ }
+
+ /* The first ptr should represent the jset we kept: */
+ memcpy(i->ptrs + i->nr_ptrs,
+ dup->ptrs,
+ sizeof(dup->ptrs[0]) * dup->nr_ptrs);
+ i->nr_ptrs += dup->nr_ptrs;
+ __journal_replay_free(c, dup);
+ }
+
+ *_i = i;
+ return 0;
+found:
+ for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
+ if (ptr->dev == ca->dev_idx) {
+ bch_err(c, "duplicate journal entry %llu on same device",
+ le64_to_cpu(i->j.seq));
+ goto out;
+ }
+ }
+
+ if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
+ bch_err(c, "found too many copies of journal entry %llu",
+ le64_to_cpu(i->j.seq));
+ goto out;
+ }
+
+ i->ptrs[i->nr_ptrs++] = entry_ptr;
+out:
+fsck_err:
+ return ret;
+}
+
+/* this fills in a range with empty jset_entries: */
+static void journal_entry_null_range(void *start, void *end)
+{
+ struct jset_entry *entry;
+
+ for (entry = start; entry != end; entry = vstruct_next(entry))
+ memset(entry, 0, sizeof(*entry));
+}
+
+#define JOURNAL_ENTRY_REREAD 5
+#define JOURNAL_ENTRY_NONE 6
+#define JOURNAL_ENTRY_BAD 7
+
+static void journal_entry_err_msg(struct printbuf *out,
+ u32 version,
+ struct jset *jset,
+ struct jset_entry *entry)
+{
+ prt_str(out, "invalid journal entry, version=");
+ bch2_version_to_text(out, version);
+
+ if (entry) {
+ prt_str(out, " type=");
+ prt_str(out, bch2_jset_entry_types[entry->type]);
+ }
+
+ if (!jset) {
+ prt_printf(out, " in superblock");
+ } else {
+ prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
+
+ if (entry)
+ prt_printf(out, " offset=%zi/%u",
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s));
+ }
+
+ prt_str(out, ": ");
+}
+
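+/*
+ * On the read side, journal entry errors are treated as fixable fsck
+ * errors; on the write side they mean we're about to write out corrupt
+ * metadata, so we count the error in the superblock, log it, and abort if
+ * bch2_fs_inconsistent() says we can't continue.
+ */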
+#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
+({ \
+ struct printbuf _buf = PRINTBUF; \
+ \
+ journal_entry_err_msg(&_buf, version, jset, entry); \
+ prt_printf(&_buf, msg, ##__VA_ARGS__); \
+ \
+ switch (flags & BKEY_INVALID_WRITE) { \
+ case READ: \
+ mustfix_fsck_err(c, _err, "%s", _buf.buf); \
+ break; \
+ case WRITE: \
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
+ bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
+ if (bch2_fs_inconsistent(c)) { \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
+ goto fsck_err; \
+ } \
+ break; \
+ } \
+ \
+ printbuf_exit(&_buf); \
+ true; \
+})
+
+#define journal_entry_err_on(cond, ...) \
+ ((cond) ? journal_entry_err(__VA_ARGS__) : false)
+
+#define FSCK_DELETED_KEY 5
+
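+/*
+ * Validate a single key in a journal entry; keys that fail validation are
+ * dropped from the entry (the remainder is shifted down and the freed space
+ * nulled out) and FSCK_DELETED_KEY is returned so the caller can keep going.
+ */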
+static int journal_validate_key(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned level, enum btree_id btree_id,
+ struct bkey_i *k,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ int write = flags & BKEY_INVALID_WRITE;
+ void *next = vstruct_next(entry);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (journal_entry_err_on(!k->k.u64s,
+ c, version, jset, entry,
+ journal_entry_bkey_u64s_0,
+ "k->u64s 0")) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(vstruct_next(entry), next);
+ return FSCK_DELETED_KEY;
+ }
+
+ if (journal_entry_err_on((void *) bkey_next(k) >
+ (void *) vstruct_next(entry),
+ c, version, jset, entry,
+ journal_entry_bkey_past_end,
+ "extends past end of journal entry")) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(vstruct_next(entry), next);
+ return FSCK_DELETED_KEY;
+ }
+
+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
+ c, version, jset, entry,
+ journal_entry_bkey_bad_format,
+ "bad format %u", k->k.format)) {
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(vstruct_next(entry), next);
+ return FSCK_DELETED_KEY;
+ }
+
+ if (!write)
+ bch2_bkey_compat(level, btree_id, version, big_endian,
+ write, NULL, bkey_to_packed(k));
+
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write, &buf)) {
+ printbuf_reset(&buf);
+ journal_entry_err_msg(&buf, version, jset, entry);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+ prt_newline(&buf);
+ bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write, &buf);
+
+ mustfix_fsck_err(c, journal_entry_bkey_invalid,
+ "%s", buf.buf);
+
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(vstruct_next(entry), next);
+
+ printbuf_exit(&buf);
+ return FSCK_DELETED_KEY;
+ }
+
+ if (write)
+ bch2_bkey_compat(level, btree_id, version, big_endian,
+ write, NULL, bkey_to_packed(k));
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int journal_entry_btree_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct bkey_i *k = entry->start;
+
+ while (k != vstruct_last(entry)) {
+ int ret = journal_validate_key(c, jset, entry,
+ entry->level,
+ entry->btree_id,
+ k, version, big_endian,
+ flags|BKEY_INVALID_JOURNAL);
+ if (ret == FSCK_DELETED_KEY)
+ continue;
+
+ k = bkey_next(k);
+ }
+
+ return 0;
+}
+
+static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct bkey_i *k;
+ bool first = true;
+
+ jset_entry_for_each_key(entry, k) {
+ if (!first) {
+ prt_newline(out);
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ }
+ prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+ first = false;
+ }
+}
+
+static int journal_entry_btree_root_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct bkey_i *k = entry->start;
+ int ret = 0;
+
+ if (journal_entry_err_on(!entry->u64s ||
+ le16_to_cpu(entry->u64s) != k->k.u64s,
+ c, version, jset, entry,
+ journal_entry_btree_root_bad_size,
+ "invalid btree root journal entry: wrong number of keys")) {
+ void *next = vstruct_next(entry);
+ /*
+ * we don't want to null out this jset_entry,
+ * just the contents, so that later we can tell
+ * we were _supposed_ to have a btree root
+ */
+ entry->u64s = 0;
+ journal_entry_null_range(vstruct_next(entry), next);
+ return 0;
+ }
+
+ ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
+ version, big_endian, flags);
+ if (ret == FSCK_DELETED_KEY)
+ ret = 0;
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ /* obsolete, don't care: */
+ return 0;
+}
+
+static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+}
+
+static int journal_entry_blacklist_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ int ret = 0;
+
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
+ c, version, jset, entry,
+ journal_entry_blacklist_bad_size,
+ "invalid journal seq blacklist entry: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ }
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist *bl =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
+}
+
+static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_blacklist_v2 *bl_entry;
+ int ret = 0;
+
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
+ c, version, jset, entry,
+ journal_entry_blacklist_v2_bad_size,
+ "invalid journal seq blacklist entry: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
+ }
+
+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
+ le64_to_cpu(bl_entry->end),
+ c, version, jset, entry,
+ journal_entry_blacklist_v2_start_past_end,
+ "invalid journal seq blacklist entry: start > end")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ }
+out:
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist_v2 *bl =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ prt_printf(out, "start=%llu end=%llu",
+ le64_to_cpu(bl->start),
+ le64_to_cpu(bl->end));
+}
+
+static int journal_entry_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < sizeof(*u),
+ c, version, jset, entry,
+ journal_entry_usage_bad_size,
+ "invalid journal entry usage: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+
+ prt_printf(out, "type=%s v=%llu",
+ bch2_fs_usage_types[u->entry.btree_id],
+ le64_to_cpu(u->v));
+}
+
+static int journal_entry_data_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ struct printbuf err = PRINTBUF;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < sizeof(*u) ||
+ bytes < sizeof(*u) + u->r.nr_devs,
+ c, version, jset, entry,
+ journal_entry_data_usage_bad_size,
+ "invalid journal entry usage: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
+ }
+
+ if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
+ c, version, jset, entry,
+ journal_entry_data_usage_bad_size,
+ "invalid journal entry usage: %s", err.buf)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
+ }
+out:
+fsck_err:
+ printbuf_exit(&err);
+ return ret;
+}
+
+static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+
+ bch2_replicas_entry_to_text(out, &u->r);
+ prt_printf(out, "=%llu", le64_to_cpu(u->v));
+}
+
+static int journal_entry_clock_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, version, jset, entry,
+ journal_entry_clock_bad_size,
+ "bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(clock->rw > 1,
+ c, version, jset, entry,
+ journal_entry_clock_bad_rw,
+ "bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+}
+
+static int journal_entry_dev_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned expected = sizeof(*u);
+ unsigned dev;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < expected,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ dev = le32_to_cpu(u->dev);
+
+ if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_dev,
+ "bad dev")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(u->pad,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_pad,
+ "bad pad")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+ prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
+
+ for (i = 0; i < nr_types; i++) {
+ if (i < BCH_DATA_NR)
+ prt_printf(out, " %s", bch2_data_types[i]);
+ else
+ prt_printf(out, " (unknown data type %u)", i);
+ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+ le64_to_cpu(u->d[i].buckets),
+ le64_to_cpu(u->d[i].sectors),
+ le64_to_cpu(u->d[i].fragmented));
+ }
+
+ prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
+}
+
+static int journal_entry_log_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return 0;
+}
+
+static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
+
+ prt_printf(out, "%.*s", bytes, l->d);
+}
+
+static int journal_entry_overwrite_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+struct jset_entry_ops {
+ int (*validate)(struct bch_fs *, struct jset *,
+ struct jset_entry *, unsigned, int,
+ enum bkey_invalid_flags);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
+};
+
+static const struct jset_entry_ops bch2_jset_entry_ops[] = {
+#define x(f, nr) \
+ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
+ .validate = journal_entry_##f##_validate, \
+ .to_text = journal_entry_##f##_to_text, \
+ },
+ BCH_JSET_ENTRY_TYPES()
+#undef x
+};
+
+int bch2_journal_entry_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return entry->type < BCH_JSET_ENTRY_NR
+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
+ version, big_endian, flags)
+ : 0;
+}
+
+void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ if (entry->type < BCH_JSET_ENTRY_NR) {
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
+ } else {
+ prt_printf(out, "(unknown type %u)", entry->type);
+ }
+}
+
+static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry *entry;
+ unsigned version = le32_to_cpu(jset->version);
+ int ret = 0;
+
+ vstruct_for_each(jset, entry) {
+ if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
+ c, version, jset, entry,
+ journal_entry_past_jset_end,
+ "journal entry extends past end of jset")) {
+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
+ break;
+ }
+
+ ret = bch2_journal_entry_validate(c, jset, entry,
+ version, JSET_BIG_ENDIAN(jset), flags);
+ if (ret)
+ break;
+ }
+fsck_err:
+ return ret;
+}
+
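+/*
+ * Full validation of a journal entry, done after the initial read pass:
+ * checks magic, version, checksum type and last_seq, then validates each
+ * jset_entry it contains.
+ */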
+static int jset_validate(struct bch_fs *c,
+ struct bch_dev *ca,
+ struct jset *jset, u64 sector,
+ enum bkey_invalid_flags flags)
+{
+ unsigned version;
+ int ret = 0;
+
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
+
+ version = le32_to_cpu(jset->version);
+ if (journal_entry_err_on(!bch2_version_compatible(version),
+ c, version, jset, NULL,
+ jset_unsupported_version,
+ "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version))) {
+ /* don't try to continue: */
+ return -EINVAL;
+ }
+
+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
+ c, version, jset, NULL,
+ jset_unknown_csum,
+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ JSET_CSUM_TYPE(jset)))
+ ret = JOURNAL_ENTRY_BAD;
+
+ /* last_seq is ignored when JSET_NO_FLUSH is true */
+ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
+ c, version, jset, NULL,
+ jset_last_seq_newer_than_seq,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(jset->last_seq),
+ le64_to_cpu(jset->seq))) {
+ jset->last_seq = jset->seq;
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ ret = jset_validate_entries(c, jset, flags);
+fsck_err:
+ return ret;
+}
+
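+/*
+ * Minimal validation done while scanning journal buckets, before we know
+ * the checksum is good: check magic and version, ask for a re-read with a
+ * bigger buffer if the entry extends past what we've read so far, and clamp
+ * entries that claim to extend past the end of the bucket.
+ */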
+static int jset_validate_early(struct bch_fs *c,
+ struct bch_dev *ca,
+ struct jset *jset, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read)
+{
+ size_t bytes = vstruct_bytes(jset);
+ unsigned version;
+ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+ int ret = 0;
+
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
+
+ version = le32_to_cpu(jset->version);
+ if (journal_entry_err_on(!bch2_version_compatible(version),
+ c, version, jset, NULL,
+ jset_unsupported_version,
+ "%s sector %llu seq %llu: unknown journal entry version %u.%u",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version))) {
+ /* don't try to continue: */
+ return -EINVAL;
+ }
+
+ if (bytes > (sectors_read << 9) &&
+ sectors_read < bucket_sectors_left)
+ return JOURNAL_ENTRY_REREAD;
+
+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
+ c, version, jset, NULL,
+ jset_past_bucket_end,
+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq), bytes))
+ le32_add_cpu(&jset->u64s,
+ -((bytes - (bucket_sectors_left << 9)) / 8));
+fsck_err:
+ return ret;
+}
+
+struct journal_read_buf {
+ void *data;
+ size_t size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+ size_t new_size)
+{
+ void *n;
+
+ /* the bios are sized for this many pages, max: */
+ if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+
+ new_size = roundup_pow_of_two(new_size);
+ n = kvpmalloc(new_size, GFP_KERNEL);
+ if (!n)
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+
+ kvpfree(b->data, b->size);
+ b->data = n;
+ b->size = new_size;
+ return 0;
+}
+
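+/*
+ * Read all the journal entries in a single journal bucket, adding each one
+ * found to the list of entries to be replayed; the read buffer is grown as
+ * needed when an entry doesn't fit.
+ */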
+static int journal_read_bucket(struct bch_dev *ca,
+ struct journal_read_buf *buf,
+ struct journal_list *jlist,
+ unsigned bucket)
+{
+ struct bch_fs *c = ca->fs;
+ struct journal_device *ja = &ca->journal;
+ struct jset *j = NULL;
+ unsigned sectors, sectors_read = 0;
+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+ end = offset + ca->mi.bucket_size;
+ bool saw_bad = false, csum_good;
+ int ret = 0;
+
+ pr_debug("reading %u", bucket);
+
+ while (offset < end) {
+ if (!sectors_read) {
+ struct bio *bio;
+ unsigned nr_bvecs;
+reread:
+ sectors_read = min_t(unsigned,
+ end - offset, buf->size >> 9);
+ nr_bvecs = buf_pages(buf->data, sectors_read << 9);
+
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
+
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, buf->data, sectors_read << 9);
+
+ ret = submit_bio_wait(bio);
+ kfree(bio);
+
+ if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
+ "journal read error: sector %llu",
+ offset) ||
+ bch2_meta_read_fault("journal")) {
+ /*
+ * We don't error out of the recovery process
+ * here, since the relevant journal entry may be
+ * found on a different device, and missing or
+ * no journal entries will be handled later
+ */
+ return 0;
+ }
+
+ j = buf->data;
+ }
+
+ ret = jset_validate_early(c, ca, j, offset,
+ end - offset, sectors_read);
+ switch (ret) {
+ case 0:
+ sectors = vstruct_sectors(j, c->block_bits);
+ break;
+ case JOURNAL_ENTRY_REREAD:
+ if (vstruct_bytes(j) > buf->size) {
+ ret = journal_read_buf_realloc(buf,
+ vstruct_bytes(j));
+ if (ret)
+ return ret;
+ }
+ goto reread;
+ case JOURNAL_ENTRY_NONE:
+ if (!saw_bad)
+ return 0;
+ /*
+ * On checksum error we don't really trust the size
+ * field of the journal entry we read, so try reading
+ * again at next block boundary:
+ */
+ sectors = block_sectors(c);
+ goto next_block;
+ default:
+ return ret;
+ }
+
+ /*
+ * This happens sometimes if we don't have discards on -
+ * when we've partially overwritten a bucket with new
+ * journal entries. We don't need the rest of the
+ * bucket:
+ */
+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+ return 0;
+
+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+
+ csum_good = jset_csum_good(c, j);
+ if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
+ "journal checksum error"))
+ saw_bad = true;
+
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+ j->encrypted_start,
+ vstruct_end(j) - (void *) j->encrypted_start);
+ bch2_fs_fatal_err_on(ret, c,
+ "error decrypting journal entry: %i", ret);
+
+ mutex_lock(&jlist->lock);
+ ret = journal_entry_add(c, ca, (struct journal_ptr) {
+ .csum_good = csum_good,
+ .dev = ca->dev_idx,
+ .bucket = bucket,
+ .bucket_offset = offset -
+ bucket_to_sector(ca, ja->buckets[bucket]),
+ .sector = offset,
+ }, jlist, j);
+ mutex_unlock(&jlist->lock);
+
+ switch (ret) {
+ case JOURNAL_ENTRY_ADD_OK:
+ break;
+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+ break;
+ default:
+ return ret;
+ }
+next_block:
+ pr_debug("next");
+ offset += sectors;
+ sectors_read -= sectors;
+ j = ((void *) j) + (sectors << 9);
+ }
+
+ return 0;
+}
+
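+/*
+ * Per-device journal read, run as a closure under the parent journal_list
+ * closure: read every journal bucket on this device, then set cur_idx and
+ * sectors_free from the most recent entry found here so new journal writes
+ * continue where the old ones left off.
+ */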
+static CLOSURE_CALLBACK(bch2_journal_read_device)
+{
+ closure_type(ja, struct journal_device, read);
+ struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+ struct bch_fs *c = ca->fs;
+ struct journal_list *jlist =
+ container_of(cl->parent, struct journal_list, cl);
+ struct journal_replay *r, **_r;
+ struct genradix_iter iter;
+ struct journal_read_buf buf = { NULL, 0 };
+ unsigned i;
+ int ret = 0;
+
+ if (!ja->nr)
+ goto out;
+
+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+ if (ret)
+ goto err;
+
+ pr_debug("%u journal buckets", ja->nr);
+
+ for (i = 0; i < ja->nr; i++) {
+ ret = journal_read_bucket(ca, &buf, jlist, i);
+ if (ret)
+ goto err;
+ }
+
+ ja->sectors_free = ca->mi.bucket_size;
+
+ mutex_lock(&jlist->lock);
+ genradix_for_each_reverse(&c->journal_entries, iter, _r) {
+ r = *_r;
+
+ if (!r)
+ continue;
+
+ for (i = 0; i < r->nr_ptrs; i++) {
+ if (r->ptrs[i].dev == ca->dev_idx) {
+ unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+ vstruct_sectors(&r->j, c->block_bits);
+
+ ja->cur_idx = r->ptrs[i].bucket;
+ ja->sectors_free = ca->mi.bucket_size - wrote;
+ goto found;
+ }
+ }
+ }
+found:
+ mutex_unlock(&jlist->lock);
+
+ if (ja->bucket_seq[ja->cur_idx] &&
+ ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+ /*
+ * Debug code for ZNS support, where we (probably) want to
+ * correlate where we stopped in the journal with the zone
+ * write points:
+ */
+ bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
+ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
+ for (i = 0; i < 3; i++) {
+ unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
+
+ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
+ }
+#endif
+ ja->sectors_free = 0;
+ }
+
+ /*
+ * Set dirty_idx to indicate the entire journal is full and needs to be
+ * reclaimed - journal reclaim will immediately reclaim whatever isn't
+ * pinned when it first runs:
+ */
+ ja->discard_idx = ja->dirty_idx_ondisk =
+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
+out:
+ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
+ kvpfree(buf.data, buf.size);
+ percpu_ref_put(&ca->io_ref);
+ closure_return(cl);
+ return;
+err:
+ mutex_lock(&jlist->lock);
+ jlist->ret = ret;
+ mutex_unlock(&jlist->lock);
+ goto out;
+}
+
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ unsigned i;
+
+ for (i = 0; i < j->nr_ptrs; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
+ u64 offset;
+
+ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
+
+ if (i)
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
+ j->ptrs[i].dev,
+ j->ptrs[i].bucket,
+ j->ptrs[i].bucket_offset,
+ j->ptrs[i].sector);
+ }
+}
+
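+/*
+ * Read the journal from every device and decide what to replay: the most
+ * recent flush entry determines *last_seq and *blacklist_seq, newer
+ * non-flush entries are dropped, gaps and blacklisted sequence numbers are
+ * checked for, and the replicas of each surviving entry are validated and
+ * marked.
+ */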
+int bch2_journal_read(struct bch_fs *c,
+ u64 *last_seq,
+ u64 *blacklist_seq,
+ u64 *start_seq)
+{
+ struct journal_list jlist;
+ struct journal_replay *i, **_i, *prev = NULL;
+ struct genradix_iter radix_iter;
+ struct bch_dev *ca;
+ unsigned iter;
+ struct printbuf buf = PRINTBUF;
+ bool degraded = false, last_write_torn = false;
+ u64 seq;
+ int ret = 0;
+
+ closure_init_stack(&jlist.cl);
+ mutex_init(&jlist.lock);
+ jlist.last_seq = 0;
+ jlist.ret = 0;
+
+ for_each_member_device(ca, c, iter) {
+ if (!c->opts.fsck &&
+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
+ continue;
+
+ if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro) &&
+ percpu_ref_tryget(&ca->io_ref))
+ closure_call(&ca->journal.read,
+ bch2_journal_read_device,
+ system_unbound_wq,
+ &jlist.cl);
+ else
+ degraded = true;
+ }
+
+ closure_sync(&jlist.cl);
+
+ if (jlist.ret)
+ return jlist.ret;
+
+ *last_seq = 0;
+ *start_seq = 0;
+ *blacklist_seq = 0;
+
+ /*
+ * Find most recent flush entry, and ignore newer non flush entries -
+ * those entries will be blacklisted:
+ */
+ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ if (!*start_seq)
+ *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+ if (JSET_NO_FLUSH(&i->j)) {
+ i->ignore = true;
+ continue;
+ }
+
+ if (!last_write_torn && !i->csum_good) {
+ last_write_torn = true;
+ i->ignore = true;
+ continue;
+ }
+
+ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+ c, le32_to_cpu(i->j.version), &i->j, NULL,
+ jset_last_seq_newer_than_seq,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(i->j.last_seq),
+ le64_to_cpu(i->j.seq)))
+ i->j.last_seq = i->j.seq;
+
+ *last_seq = le64_to_cpu(i->j.last_seq);
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+ break;
+ }
+
+ if (!*start_seq) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
+ if (!*last_seq) {
+ fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
+ "journal read done, but no entries found after dropping non-flushes");
+ return 0;
+ }
+
+ bch_info(c, "journal read done, replaying entries %llu-%llu",
+ *last_seq, *blacklist_seq - 1);
+
+ if (*start_seq != *blacklist_seq)
+ bch_info(c, "dropped unflushed entries %llu-%llu",
+ *blacklist_seq, *start_seq - 1);
+
+ /* Drop blacklisted entries and entries older than last_seq: */
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ seq = le64_to_cpu(i->j.seq);
+ if (seq < *last_seq) {
+ journal_replay_free(c, i);
+ continue;
+ }
+
+ if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+ fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+ jset_seq_blacklisted,
+ "found blacklisted journal entry %llu", seq);
+ i->ignore = true;
+ }
+ }
+
+ /* Check for missing entries: */
+ seq = *last_seq;
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+ while (seq < le64_to_cpu(i->j.seq)) {
+ u64 missing_start, missing_end;
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
+
+ missing_start = seq;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf1, c, prev);
+ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+ } else
+ prt_printf(&buf1, "(none)");
+ bch2_journal_ptrs_to_text(&buf2, c, i);
+
+ missing_end = seq - 1;
+ fsck_err(c, journal_entries_missing,
+ "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+ " prev at %s\n"
+ " next at %s",
+ missing_start, missing_end,
+ *last_seq, *blacklist_seq - 1,
+ buf1.buf, buf2.buf);
+
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
+ }
+
+ prev = i;
+ seq++;
+ }
+
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ struct bch_replicas_padded replicas = {
+ .e.data_type = BCH_DATA_journal,
+ .e.nr_required = 1,
+ };
+ unsigned ptr;
+
+ i = *_i;
+ if (!i || i->ignore)
+ continue;
+
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
+ ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+
+ if (!i->ptrs[ptr].csum_good)
+ bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+ "invalid journal checksum, seq %llu%s",
+ le64_to_cpu(i->j.seq),
+ i->csum_good ? " (had good copy on another device)" : "");
+ }
+
+ ret = jset_validate(c,
+ bch_dev_bkey_exists(c, i->ptrs[0].dev),
+ &i->j,
+ i->ptrs[0].sector,
+ READ);
+ if (ret)
+ goto err;
+
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+
+ bch2_replicas_entry_sort(&replicas.e);
+
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, &replicas.e);
+
+ if (!degraded &&
+ !bch2_replicas_marked(c, &replicas.e) &&
+ (le64_to_cpu(i->j.seq) == *last_seq ||
+ fsck_err(c, journal_entry_replicas_not_marked,
+ "superblock not marked as containing replicas for journal entry %llu\n %s",
+ le64_to_cpu(i->j.seq), buf.buf))) {
+ ret = bch2_mark_replicas(c, &replicas.e);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* journal write: */
+
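+/*
+ * Add pointers to @w->key for the new journal entry, walking devices in
+ * stripe-sorted order and skipping devices that are unusable, already have
+ * a pointer, or don't have room in their current journal bucket; stops once
+ * @replicas_want replicas' worth of durability has been allocated.
+ */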
+static void __journal_write_alloc(struct journal *j,
+ struct journal_buf *w,
+ struct dev_alloc_list *devs_sorted,
+ unsigned sectors,
+ unsigned *replicas,
+ unsigned replicas_want)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_device *ja;
+ struct bch_dev *ca;
+ unsigned i;
+
+ if (*replicas >= replicas_want)
+ return;
+
+ for (i = 0; i < devs_sorted->nr; i++) {
+ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
+ if (!ca)
+ continue;
+
+ ja = &ca->journal;
+
+ /*
+ * Check that we can use this device, and aren't already using
+ * it:
+ */
+ if (!ca->mi.durability ||
+ ca->mi.state != BCH_MEMBER_STATE_rw ||
+ !ja->nr ||
+ bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
+ sectors > ja->sectors_free)
+ continue;
+
+ bch2_dev_stripe_increment(ca, &j->wp.stripe);
+
+ bch2_bkey_append_ptr(&w->key,
+ (struct bch_extent_ptr) {
+ .offset = bucket_to_sector(ca,
+ ja->buckets[ja->cur_idx]) +
+ ca->mi.bucket_size -
+ ja->sectors_free,
+ .dev = ca->dev_idx,
+ });
+
+ ja->sectors_free -= sectors;
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+
+ *replicas += ca->mi.durability;
+
+ if (*replicas >= replicas_want)
+ break;
+ }
+}
+
+/**
+ * journal_write_alloc - decide where to write next journal entry
+ *
+ * @j: journal object
+ * @w: journal buf (entry to be written)
+ *
+ * Returns: 0 on success, or -EROFS on failure
+ */
+static int journal_write_alloc(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_devs_mask devs;
+ struct journal_device *ja;
+ struct bch_dev *ca;
+ struct dev_alloc_list devs_sorted;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+ unsigned target = c->opts.metadata_target ?:
+ c->opts.foreground_target;
+ unsigned i, replicas = 0, replicas_want =
+ READ_ONCE(c->opts.metadata_replicas);
+
+ rcu_read_lock();
+retry:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
+
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+
+ __journal_write_alloc(j, w, &devs_sorted,
+ sectors, &replicas, replicas_want);
+
+ if (replicas >= replicas_want)
+ goto done;
+
+ for (i = 0; i < devs_sorted.nr; i++) {
+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+ if (!ca)
+ continue;
+
+ ja = &ca->journal;
+
+ if (sectors > ja->sectors_free &&
+ sectors <= ca->mi.bucket_size &&
+ bch2_journal_dev_buckets_available(j, ja,
+ journal_space_discarded)) {
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ ja->sectors_free = ca->mi.bucket_size;
+
+ /*
+ * ja->bucket_seq[ja->cur_idx] must always have
+ * something sensible:
+ */
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+ }
+ }
+
+ __journal_write_alloc(j, w, &devs_sorted,
+ sectors, &replicas, replicas_want);
+
+ if (replicas < replicas_want && target) {
+ /* Retry from all devices: */
+ target = 0;
+ goto retry;
+ }
+done:
+ rcu_read_unlock();
+
+ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
+
+ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+}
+
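+/*
+ * Grow the journal buffer to buf_size_want, if a larger size has been
+ * requested: the allocation and copy are done outside j->lock, with only
+ * the pointer swap done under the lock.
+ */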
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+ /* we aren't holding j->lock: */
+ unsigned new_size = READ_ONCE(j->buf_size_want);
+ void *new_buf;
+
+ if (buf->buf_size >= new_size)
+ return;
+
+ new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+ if (!new_buf)
+ return;
+
+ memcpy(new_buf, buf->data, buf->buf_size);
+
+ spin_lock(&j->lock);
+ swap(buf->data, new_buf);
+ swap(buf->buf_size, new_size);
+ spin_unlock(&j->lock);
+
+ kvpfree(new_buf, new_size);
+}
+
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{
+ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
+}
+
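+/*
+ * Completion path for a journal write: update the on-disk sequence numbers
+ * and replicas for the entry that was just written, advance unwritten_idx,
+ * wake up waiters, and either submit the next unwritten entry or arm the
+ * timer for the currently open one.
+ */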
+static CLOSURE_CALLBACK(journal_write_done)
+{
+ closure_type(j, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
+ union journal_res_state old, new;
+ u64 v, seq;
+ int err = 0;
+
+ bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
+ ? j->flush_write_time
+ : j->noflush_write_time, j->write_start_time);
+
+ if (!w->devs_written.nr) {
+ bch_err(c, "unable to write journal to sufficient devices");
+ err = -EIO;
+ } else {
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
+ if (bch2_mark_replicas(c, &replicas.e))
+ err = -EIO;
+ }
+
+ if (err)
+ bch2_fatal_error(c);
+
+ spin_lock(&j->lock);
+ seq = le64_to_cpu(w->data->seq);
+
+ if (seq >= j->pin.front)
+ journal_seq_pin(j, seq)->devs = w->devs_written;
+
+ if (!err) {
+ if (!JSET_NO_FLUSH(w->data)) {
+ j->flushed_seq_ondisk = seq;
+ j->last_seq_ondisk = w->last_seq;
+
+ bch2_do_discards(c);
+ closure_wake_up(&c->freelist_wait);
+
+ bch2_reset_alloc_cursors(c);
+ }
+ } else if (!j->err_seq || seq < j->err_seq)
+ j->err_seq = seq;
+
+ j->seq_ondisk = seq;
+
+ /*
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ if (j->watermark != BCH_WATERMARK_stripe)
+ journal_reclaim_kick(&c->journal);
+
+ /* also must come before signalling write completion: */
+ closure_debug_destroy(cl);
+
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
+
+ new.unwritten_idx++;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ bch2_journal_reclaim_fast(j);
+ bch2_journal_space_available(j);
+
+ closure_wake_up(&w->wait);
+ journal_wake(j);
+
+ if (!journal_state_count(new, new.unwritten_idx) &&
+ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
+ spin_unlock(&j->lock);
+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ struct journal_buf *buf = journal_cur_buf(j);
+ long delta = buf->expires - jiffies;
+
+ /*
+ * We don't close a journal entry to write it while there are
+ * previous entries still in flight - the current journal entry
+ * might want to be written now:
+ */
+
+ spin_unlock(&j->lock);
+ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+ } else {
+ spin_unlock(&j->lock);
+ }
+}
+
+static void journal_write_endio(struct bio *bio)
+{
+ struct bch_dev *ca = bio->bi_private;
+ struct journal *j = &ca->fs->journal;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ unsigned long flags;
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "error writing journal entry %llu: %s",
+ le64_to_cpu(w->data->seq),
+ bch2_blk_status_to_str(bio->bi_status)) ||
+ bch2_meta_write_fault("journal")) {
+ spin_lock_irqsave(&j->err_lock, flags);
+ bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
+ spin_unlock_irqrestore(&j->err_lock, flags);
+ }
+
+ closure_put(&j->io);
+ percpu_ref_put(&ca->io_ref);
+}
+
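+/*
+ * Submit the journal write to each device we allocated a pointer on:
+ * flush writes get REQ_FUA, and REQ_PREFLUSH as well unless a separate
+ * flush has already been issued to every device.
+ */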
+static CLOSURE_CALLBACK(do_journal_write)
+{
+ closure_type(j, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_extent_ptr *ptr;
+ struct bio *bio;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
+
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
+ sectors);
+
+ bio = ca->journal.bio;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+
+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+ ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
+ if (!JSET_NO_FLUSH(w->data))
+ bio->bi_opf |= REQ_FUA;
+ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+ bio->bi_opf |= REQ_PREFLUSH;
+
+ bch2_bio_map(bio, w->data, sectors << 9);
+
+ trace_and_count(c, journal_write, bio);
+ closure_bio_submit(bio, cl);
+
+ ca->journal.bucket_seq[ca->journal.cur_idx] =
+ le64_to_cpu(w->data->seq);
+ }
+
+ continue_at(cl, journal_write_done, c->io_complete_wq);
+}
+
+static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset_entry *start, *end, *i, *next, *prev = NULL;
+ struct jset *jset = w->data;
+ unsigned sectors, bytes, u64s;
+ bool validate_before_checksum = false;
+ unsigned long btree_roots_have = 0;
+ int ret;
+
+ /*
+ * Simple compaction, dropping empty jset_entries (from journal
+ * reservations that weren't fully used) and merging jset_entries that
+ * can be.
+ *
+ * If we wanted to be really fancy here, we could sort all the keys in
+ * the jset and drop keys that were overwritten - probably not worth it:
+ */
+ vstruct_for_each_safe(jset, i, next) {
+ unsigned u64s = le16_to_cpu(i->u64s);
+
+ /* Empty entry: */
+ if (!u64s)
+ continue;
+
+ /*
+ * New btree roots are set by journalling them; when the journal
+ * entry gets written we have to propagate them to
+ * c->btree_roots
+ *
+ * But, every journal entry we write has to contain all the
+ * btree roots (at least for now); so after we copy btree roots
+ * to c->btree_roots we have to get any missing btree roots and
+ * add them to this journal entry:
+ */
+ if (i->type == BCH_JSET_ENTRY_btree_root) {
+ bch2_journal_entry_to_btree_root(c, i);
+ __set_bit(i->btree_id, &btree_roots_have);
+ }
+
+ /* Can we merge with previous entry? */
+ if (prev &&
+ i->btree_id == prev->btree_id &&
+ i->level == prev->level &&
+ i->type == prev->type &&
+ i->type == BCH_JSET_ENTRY_btree_keys &&
+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+ memmove_u64s_down(vstruct_next(prev),
+ i->_data,
+ u64s);
+ le16_add_cpu(&prev->u64s, u64s);
+ continue;
+ }
+
+ /* Couldn't merge, move i into new position (after prev): */
+ prev = prev ? vstruct_next(prev) : jset->start;
+ if (i != prev)
+ memmove_u64s_down(prev, i, jset_u64s(u64s));
+ }
+
+ prev = prev ? vstruct_next(prev) : jset->start;
+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+
+ start = end = vstruct_last(jset);
+
+ end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
+
+ bch2_journal_super_entries_add_common(c, &end,
+ le64_to_cpu(jset->seq));
+ u64s = (u64 *) end - (u64 *) start;
+ BUG_ON(u64s > j->entry_u64s_reserved);
+
+ le32_add_cpu(&jset->u64s, u64s);
+
+ sectors = vstruct_sectors(jset, c->block_bits);
+ bytes = vstruct_bytes(jset);
+
+ if (sectors > w->sectors) {
+ bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+ vstruct_bytes(jset), w->sectors << 9,
+ u64s, w->u64s_reserved, j->entry_u64s_reserved);
+ return -EINVAL;
+ }
+
+ jset->magic = cpu_to_le64(jset_magic(c));
+ jset->version = cpu_to_le32(c->sb.version);
+
+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+
+ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
+ j->last_empty_seq = le64_to_cpu(jset->seq);
+
+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
+ validate_before_checksum = true;
+
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
+ validate_before_checksum = true;
+
+ if (validate_before_checksum &&
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
+
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ jset->encrypted_start,
+ vstruct_end(jset) - (void *) jset->encrypted_start);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error encrypting journal entry: %i", ret))
+ return ret;
+
+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+ journal_nonce(jset), jset);
+
+ if (!validate_before_checksum &&
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
+
+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+ return 0;
+}
+
+static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ int error = bch2_journal_error(j);
+
+ /*
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+ * noflush so they'll never be used, but they'll still be visible to the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
+ */
+ if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+ return -EIO;
+
+ if (error ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(w->data, true);
+ w->data->last_seq = 0;
+ w->last_seq = 0;
+
+ j->nr_noflush_writes++;
+ } else {
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ }
+
+ return 0;
+}
+
+CLOSURE_CALLBACK(bch2_journal_write)
+{
+ closure_type(j, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
+ struct bio *bio;
+ struct printbuf journal_debug_buf = PRINTBUF;
+ unsigned i, nr_rw_members = 0;
+ int ret;
+
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+ j->write_start_time = local_clock();
+
+ spin_lock(&j->lock);
+ ret = bch2_journal_write_pick_flush(j, w);
+ spin_unlock(&j->lock);
+ if (ret)
+ goto err;
+
+ journal_buf_realloc(j, w);
+
+ ret = bch2_journal_write_prep(j, w);
+ if (ret)
+ goto err;
+
+ while (1) {
+ spin_lock(&j->lock);
+ ret = journal_write_alloc(j, w);
+ if (!ret || !j->can_discard)
+ break;
+
+ spin_unlock(&j->lock);
+ bch2_journal_do_discards(j);
+ }
+
+ if (ret) {
+ __bch2_journal_debug_to_text(&journal_debug_buf, j);
+ spin_unlock(&j->lock);
+ bch_err(c, "Unable to allocate journal write:\n%s",
+ journal_debug_buf.buf);
+ printbuf_exit(&journal_debug_buf);
+ goto err;
+ }
+
+ /*
+ * write is allocated, no longer need to account for it in
+ * bch2_journal_space_available():
+ */
+ w->sectors = 0;
+
+ /*
+ * journal entry has been compacted and allocated, recalculate space
+ * available:
+ */
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+
+ w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+
+ if (c->opts.nochanges)
+ goto no_io;
+
+ for_each_rw_member(ca, c, i)
+ nr_rw_members++;
+
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
+
+ /*
+ * Mark journal replicas before we submit the write to guarantee
+ * recovery will find the journal entries after a crash.
+ */
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ if (ret)
+ goto err;
+
+ if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
+ for_each_rw_member(ca, c, i) {
+ percpu_ref_get(&ca->io_ref);
+
+ bio = ca->journal.bio;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+ }
+
+ continue_at(cl, do_journal_write, c->io_complete_wq);
+ return;
+no_io:
+ continue_at(cl, journal_write_done, c->io_complete_wq);
+ return;
+err:
+ bch2_fatal_error(c);
+ continue_at(cl, journal_write_done, c->io_complete_wq);
+}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
new file mode 100644
index 000000000000..c035e7c108e1
--- /dev/null
+++ b/fs/bcachefs/journal_io.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_IO_H
+#define _BCACHEFS_JOURNAL_IO_H
+
+/*
+ * Only used for holding the journal entries we read in bch2_journal_read()
+ * when bringing up the filesystem:
+ */
+struct journal_replay {
+ struct journal_ptr {
+ bool csum_good;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+ } ptrs[BCH_REPLICAS_MAX];
+ unsigned nr_ptrs;
+
+ bool csum_good;
+ bool ignore;
+ /* must be last: */
+ struct jset j;
+};
+
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+ struct jset_entry *entry, unsigned type)
+{
+ while (entry < vstruct_last(jset)) {
+ if (entry->type == type)
+ return entry;
+
+ entry = vstruct_next(entry);
+ }
+
+ return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type) \
+ for (entry = (jset)->start; \
+ (entry = __jset_entry_type_next(jset, entry, type)); \
+ entry = vstruct_next(entry))
+
+#define jset_entry_for_each_key(_e, _k) \
+ for (_k = (_e)->start; \
+ _k < vstruct_last(_e); \
+ _k = bkey_next(_k))
+
+#define for_each_jset_key(k, entry, jset) \
+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
+ jset_entry_for_each_key(entry, k)
+
+int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
+ struct jset_entry *, unsigned, int,
+ enum bkey_invalid_flags);
+void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
+ struct jset_entry *);
+
+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct journal_replay *);
+
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
+
+CLOSURE_CALLBACK(bch2_journal_write);
+
+#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
new file mode 100644
index 000000000000..ec712104addb
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "trace.h"
+
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+
+/* Free space calculations: */
+
+static unsigned journal_space_from(struct journal_device *ja,
+ enum journal_space_from from)
+{
+ switch (from) {
+ case journal_space_discarded:
+ return ja->discard_idx;
+ case journal_space_clean_ondisk:
+ return ja->dirty_idx_ondisk;
+ case journal_space_clean:
+ return ja->dirty_idx;
+ default:
+ BUG();
+ }
+}
+
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
+ struct journal_device *ja,
+ enum journal_space_from from)
+{
+ unsigned available = (journal_space_from(ja, from) -
+ ja->cur_idx - 1 + ja->nr) % ja->nr;
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
+ --available;
+
+ return available;
+}
+
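+/*
+ * Raise the journal watermark to BCH_WATERMARK_reclaim when we're low on
+ * space or on journal pin fifo entries, so that only high priority
+ * reservations succeed; wake up waiters when the watermark drops back down.
+ */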
+static inline void journal_set_watermark(struct journal *j, bool low_on_space)
+{
+ unsigned watermark = BCH_WATERMARK_stripe;
+
+ if (low_on_space)
+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+ if (fifo_free(&j->pin) < j->pin.size / 4)
+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+ if (watermark == j->watermark)
+ return;
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
+}
+
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+ enum journal_space_from from)
+{
+ struct journal_device *ja = &ca->journal;
+ unsigned sectors, buckets, unwritten;
+ u64 seq;
+
+ if (from == journal_space_total)
+ return (struct journal_space) {
+ .next_entry = ca->mi.bucket_size,
+ .total = ca->mi.bucket_size * ja->nr,
+ };
+
+ buckets = bch2_journal_dev_buckets_available(j, ja, from);
+ sectors = ja->sectors_free;
+
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, account for it here:
+ */
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
+
+ if (!unwritten)
+ continue;
+
+ /* entry won't fit on this device, skip: */
+ if (unwritten > ca->mi.bucket_size)
+ continue;
+
+ if (unwritten >= sectors) {
+ if (!buckets) {
+ sectors = 0;
+ break;
+ }
+
+ buckets--;
+ sectors = ca->mi.bucket_size;
+ }
+
+ sectors -= unwritten;
+ }
+
+ if (sectors < ca->mi.bucket_size && buckets) {
+ buckets--;
+ sectors = ca->mi.bucket_size;
+ }
+
+ return (struct journal_space) {
+ .next_entry = sectors,
+ .total = sectors + buckets * ca->mi.bucket_size,
+ };
+}
+
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
+ enum journal_space_from from)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned i, pos, nr_devs = 0;
+ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_journal]) {
+ if (!ca->journal.nr)
+ continue;
+
+ space = journal_dev_space_available(j, ca, from);
+ if (!space.next_entry)
+ continue;
+
+ for (pos = 0; pos < nr_devs; pos++)
+ if (space.total > dev_space[pos].total)
+ break;
+
+ array_insert_item(dev_space, nr_devs, pos, space);
+ }
+ rcu_read_unlock();
+
+ if (nr_devs < nr_devs_want)
+ return (struct journal_space) { 0, 0 };
+
+ /*
+ * We sorted largest to smallest, and we want the smallest out of the
+ * @nr_devs_want largest devices:
+ */
+ return dev_space[nr_devs_want - 1];
+}
+
+void bch2_journal_space_available(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned clean, clean_ondisk, total;
+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
+ j->buf[1].buf_size >> 9);
+ unsigned i, nr_online = 0, nr_devs_want;
+ bool can_discard = false;
+ int ret = 0;
+
+ lockdep_assert_held(&j->lock);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_journal]) {
+ struct journal_device *ja = &ca->journal;
+
+ if (!ja->nr)
+ continue;
+
+ while (ja->dirty_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
+
+ while (ja->dirty_idx_ondisk != ja->dirty_idx &&
+ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
+
+ if (ja->discard_idx != ja->dirty_idx_ondisk)
+ can_discard = true;
+
+ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
+ nr_online++;
+ }
+ rcu_read_unlock();
+
+ j->can_discard = can_discard;
+
+ if (nr_online < c->opts.metadata_replicas_required) {
+ ret = JOURNAL_ERR_insufficient_devices;
+ goto out;
+ }
+
+ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
+
+ for (i = 0; i < journal_space_nr; i++)
+ j->space[i] = __journal_space_available(j, nr_devs_want, i);
+
+ clean_ondisk = j->space[journal_space_clean_ondisk].total;
+ clean = j->space[journal_space_clean].total;
+ total = j->space[journal_space_total].total;
+
+ if (!j->space[journal_space_discarded].next_entry)
+ ret = JOURNAL_ERR_journal_full;
+
+ if ((j->space[journal_space_clean_ondisk].next_entry <
+ j->space[journal_space_clean_ondisk].total) &&
+ (clean - clean_ondisk <= total / 8) &&
+ (clean_ondisk * 2 > clean))
+ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ else
+ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
+ journal_set_watermark(j, clean * 4 <= total);
+out:
+ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
+ j->cur_entry_error = ret;
+
+ if (!ret)
+ journal_wake(j);
+}
+
+/* Discards - last part of journal reclaim: */
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = ja->discard_idx != ja->dirty_idx_ondisk;
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/*
+ * Advance ja->discard_idx as long as it points to buckets that are no longer
+ * dirty, issuing discards if necessary:
+ */
+void bch2_journal_do_discards(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned iter;
+
+ mutex_lock(&j->discard_lock);
+
+ for_each_rw_member(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ while (should_discard_bucket(j, ja)) {
+ if (!c->opts.nochanges &&
+ ca->mi.discard &&
+ bdev_max_discard_sectors(ca->disk_sb.bdev))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ ja->buckets[ja->discard_idx]),
+ ca->mi.bucket_size, GFP_NOFS);
+
+ spin_lock(&j->lock);
+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
+
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+ }
+ }
+
+ mutex_unlock(&j->discard_lock);
+}
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, holding it open to ensure it gets replayed during recovery:
+ */
+
+void bch2_journal_reclaim_fast(struct journal *j)
+{
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
+
+ /*
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
+ */
+ while (!fifo_empty(&j->pin) &&
+ !atomic_read(&fifo_peek_front(&j->pin).count)) {
+ j->pin.front++;
+ popped = true;
+ }
+
+ if (popped)
+ bch2_journal_space_available(j);
+}
+
+bool __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ return atomic_dec_and_test(&pin_list->count);
+}
+
+void bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ if (__bch2_journal_pin_put(j, seq)) {
+ spin_lock(&j->lock);
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+ }
+}
+
+static inline bool __journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ struct journal_entry_pin_list *pin_list;
+
+ if (!journal_pin_active(pin))
+ return false;
+
+ if (j->flush_in_progress == pin)
+ j->flush_in_progress_dropped = true;
+
+ pin_list = journal_seq_pin(j, pin->seq);
+ pin->seq = 0;
+ list_del_init(&pin->list);
+
+ /*
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
+ * writing a new last_seq will now make another bucket available:
+ */
+ return atomic_dec_and_test(&pin_list->count) &&
+ pin_list == &fifo_peek_front(&j->pin);
+}
+
+void bch2_journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ spin_lock(&j->lock);
+ if (__journal_pin_drop(j, pin))
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+}
+
+static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+{
+ if (fn == bch2_btree_node_flush0 ||
+ fn == bch2_btree_node_flush1)
+ return JOURNAL_PIN_btree;
+ else if (fn == bch2_btree_key_cache_journal_flush)
+ return JOURNAL_PIN_key_cache;
+ else
+ return JOURNAL_PIN_other;
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ struct journal_entry_pin_list *pin_list;
+ bool reclaim;
+
+ spin_lock(&j->lock);
+
+ if (seq < journal_last_seq(j)) {
+ /*
+ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
+ * the src pin - with the pin dropped, the entry to pin might no
+ * longer exist, but that means there's no longer anything to
+ * copy and we can bail out here:
+ */
+ spin_unlock(&j->lock);
+ return;
+ }
+
+ pin_list = journal_seq_pin(j, seq);
+
+ reclaim = __journal_pin_drop(j, pin);
+
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+
+ if (flush_fn)
+ list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
+ else
+ list_add(&pin->list, &pin_list->flushed);
+
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
+
+/**
+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ * @j: journal object
+ * @pin: pin to flush
+ */
+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
+{
+ BUG_ON(journal_pin_active(pin));
+
+ wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
+}
+
+/*
+ * Journal reclaim: flush references to open journal entries to reclaim space in
+ * the journal
+ *
+ * May be done by the journal code in the background as needed to free up space
+ * for more journal entries, or as part of doing a clean shutdown, or to migrate
+ * data off of a specific device:
+ */
+
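+/*
+ * @allowed_below_seq/@allowed_above_seq are bitmasks of journal pin types
+ * (1U << JOURNAL_PIN_*): which pin types may be flushed for entries at or
+ * below @seq_to_flush, and which may be flushed even above it.
+ */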
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j,
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
+ u64 *seq)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *ret = NULL;
+ unsigned i;
+
+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
+ if (*seq > seq_to_flush && !allowed_above_seq)
+ break;
+
+ for (i = 0; i < JOURNAL_PIN_NR; i++)
+ if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+ ((1U << i) & allowed_above_seq)) {
+ ret = list_first_entry_or_null(&pin_list->list[i],
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return NULL;
+}
+
+/* returns true if we did work */
+static size_t journal_flush_pins(struct journal *j,
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
+ unsigned min_any,
+ unsigned min_key_cache)
+{
+ struct journal_entry_pin *pin;
+ size_t nr_flushed = 0;
+ journal_pin_flush_fn flush_fn;
+ u64 seq;
+ int err;
+
+ lockdep_assert_held(&j->reclaim_lock);
+
+ while (1) {
+ unsigned allowed_above = allowed_above_seq;
+ unsigned allowed_below = allowed_below_seq;
+
+ if (min_any) {
+ allowed_above |= ~0;
+ allowed_below |= ~0;
+ }
+
+ if (min_key_cache) {
+ allowed_above |= 1U << JOURNAL_PIN_key_cache;
+ allowed_below |= 1U << JOURNAL_PIN_key_cache;
+ }
+
+ cond_resched();
+
+ j->last_flushed = jiffies;
+
+ spin_lock(&j->lock);
+ pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
+ if (pin) {
+ BUG_ON(j->flush_in_progress);
+ j->flush_in_progress = pin;
+ j->flush_in_progress_dropped = false;
+ flush_fn = pin->flush;
+ }
+ spin_unlock(&j->lock);
+
+ if (!pin)
+ break;
+
+ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
+ min_key_cache--;
+
+ if (min_any)
+ min_any--;
+
+ err = flush_fn(j, pin, seq);
+
+ spin_lock(&j->lock);
+ /* Pin might have been dropped or rearmed: */
+ if (likely(!err && !j->flush_in_progress_dropped))
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+ j->flush_in_progress = NULL;
+ j->flush_in_progress_dropped = false;
+ spin_unlock(&j->lock);
+
+ wake_up(&j->pin_flush_wait);
+
+ if (err)
+ break;
+
+ nr_flushed++;
+ }
+
+ return nr_flushed;
+}
+
+static u64 journal_seq_to_flush(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ u64 seq_to_flush = 0;
+ unsigned iter;
+
+ spin_lock(&j->lock);
+
+ for_each_rw_member(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+ unsigned nr_buckets, bucket_to_flush;
+
+ if (!ja->nr)
+ continue;
+
+ /* Try to keep the journal at most half full: */
+ nr_buckets = ja->nr / 2;
+
+ nr_buckets = min(nr_buckets, ja->nr);
+
+ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
+ seq_to_flush = max(seq_to_flush,
+ ja->bucket_seq[bucket_to_flush]);
+ }
+
+ /* Also flush if the pin fifo is more than half full */
+ seq_to_flush = max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
+ spin_unlock(&j->lock);
+
+ return seq_to_flush;
+}
+
+/**
+ * __bch2_journal_reclaim - free up journal buckets
+ * @j: journal object
+ * @direct: direct or background reclaim?
+ * @kicked: requested to run since we last ran?
+ * Returns: 0 on success, or -EIO if the journal has been shutdown
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ u64 seq_to_flush;
+ size_t min_nr, min_key_cache, nr_flushed;
+ unsigned flags;
+ int ret = 0;
+
+ /*
+ * We can't invoke memory reclaim while holding the reclaim_lock -
+ * journal reclaim is required to make progress for memory reclaim
+ * (cleaning the caches), so we can't get stuck in memory reclaim while
+ * we're holding the reclaim lock:
+ */
+ lockdep_assert_held(&j->reclaim_lock);
+ flags = memalloc_noreclaim_save();
+
+ do {
+ if (kthread && kthread_should_stop())
+ break;
+
+ if (bch2_journal_error(j)) {
+ ret = -EIO;
+ break;
+ }
+
+ bch2_journal_do_discards(j);
+
+ seq_to_flush = journal_seq_to_flush(j);
+ min_nr = 0;
+
+ /*
+ * If it's been longer than c->opts.journal_reclaim_delay since we last
+ * flushed, make sure to flush at least one journal pin:
+ */
+ if (time_after(jiffies, j->last_flushed +
+ msecs_to_jiffies(c->opts.journal_reclaim_delay)))
+ min_nr = 1;
+
+ if (j->watermark != BCH_WATERMARK_stripe)
+ min_nr = 1;
+
+ if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+ min_nr = 1;
+
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
+
+ trace_and_count(c, journal_reclaim_start, c,
+ direct, kicked,
+ min_nr, min_key_cache,
+ atomic_read(&c->btree_cache.dirty),
+ c->btree_cache.used,
+ atomic_long_read(&c->btree_key_cache.nr_dirty),
+ atomic_long_read(&c->btree_key_cache.nr_keys));
+
+ nr_flushed = journal_flush_pins(j, seq_to_flush,
+ ~0, 0,
+ min_nr, min_key_cache);
+
+ if (direct)
+ j->nr_direct_reclaim += nr_flushed;
+ else
+ j->nr_background_reclaim += nr_flushed;
+ trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
+
+ if (nr_flushed)
+ wake_up(&j->reclaim_wait);
+ } while ((min_nr || min_key_cache) && nr_flushed && !direct);
+
+ memalloc_noreclaim_restore(flags);
+
+ return ret;
+}
+
+int bch2_journal_reclaim(struct journal *j)
+{
+ return __bch2_journal_reclaim(j, true, true);
+}
+
+static int bch2_journal_reclaim_thread(void *arg)
+{
+ struct journal *j = arg;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned long delay, now;
+ bool journal_empty;
+ int ret = 0;
+
+ set_freezable();
+
+ j->last_flushed = jiffies;
+
+ while (!ret && !kthread_should_stop()) {
+ bool kicked = j->reclaim_kicked;
+
+ j->reclaim_kicked = false;
+
+ mutex_lock(&j->reclaim_lock);
+ ret = __bch2_journal_reclaim(j, false, kicked);
+ mutex_unlock(&j->reclaim_lock);
+
+ now = jiffies;
+ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
+ j->next_reclaim = j->last_flushed + delay;
+
+ if (!time_in_range(j->next_reclaim, now, now + delay))
+ j->next_reclaim = now + delay;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ if (kthread_should_stop())
+ break;
+ if (j->reclaim_kicked)
+ break;
+
+ spin_lock(&j->lock);
+ journal_empty = fifo_empty(&j->pin);
+ spin_unlock(&j->lock);
+
+ if (journal_empty)
+ schedule();
+ else if (time_after(j->next_reclaim, jiffies))
+ schedule_timeout(j->next_reclaim - jiffies);
+ else
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+
+ return 0;
+}
+
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+ struct task_struct *p = j->reclaim_thread;
+
+ j->reclaim_thread = NULL;
+
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+}
+
+int bch2_journal_reclaim_start(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct task_struct *p;
+ int ret;
+
+ if (j->reclaim_thread)
+ return 0;
+
+ p = kthread_create(bch2_journal_reclaim_thread, j,
+ "bch-reclaim/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(p);
+ if (ret) {
+ bch_err_msg(c, ret, "creating journal reclaim thread");
+ return ret;
+ }
+
+ get_task_struct(p);
+ j->reclaim_thread = p;
+ wake_up_process(p);
+ return 0;
+}
+
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+ bool *did_work)
+{
+ int ret;
+
+ ret = bch2_journal_error(j);
+ if (ret)
+ return ret;
+
+ mutex_lock(&j->reclaim_lock);
+
+ if (journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_key_cache)|
+ (1U << JOURNAL_PIN_other), 0, 0, 0) ||
+ journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_btree), 0, 0, 0))
+ *did_work = true;
+
+ if (seq_to_flush > journal_cur_seq(j))
+ bch2_journal_entry_close(j);
+
+ spin_lock(&j->lock);
+ /*
+ * If journal replay hasn't completed, the unreplayed journal entries
+ * hold refs on their corresponding sequence numbers
+ */
+ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ journal_last_seq(j) > seq_to_flush ||
+ !fifo_used(&j->pin);
+
+ spin_unlock(&j->lock);
+ mutex_unlock(&j->reclaim_lock);
+
+ return ret;
+}
+
+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+{
+ bool did_work = false;
+
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return false;
+
+ closure_wait_event(&j->async_wait,
+ journal_flush_done(j, seq_to_flush, &did_work));
+
+ return did_work;
+}
+
+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ u64 iter, seq = 0;
+ int ret = 0;
+
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
+ if (dev_idx >= 0
+ ? bch2_dev_list_has_dev(p->devs, dev_idx)
+ : p->devs.nr < c->opts.metadata_replicas)
+ seq = iter;
+ spin_unlock(&j->lock);
+
+ bch2_journal_flush_pins(j, seq);
+
+ ret = bch2_journal_error(j);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
+
+ /*
+ * Now that we've populated replicas_gc, write to the journal to mark
+ * active journal devices. This handles the case where the journal might
+ * be empty. Otherwise we could clear all journal replicas and
+ * temporarily put the fs into an unrecoverable state. Journal recovery
+ * expects to find devices marked for journal data on unclean mount.
+ */
+ ret = bch2_journal_meta(&c->journal);
+ if (ret)
+ goto err;
+
+ seq = 0;
+ spin_lock(&j->lock);
+ while (!ret) {
+ struct bch_replicas_padded replicas;
+
+ seq = max(seq, journal_last_seq(j));
+ if (seq >= j->pin.back)
+ break;
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ journal_seq_pin(j, seq)->devs);
+ seq++;
+
+ spin_unlock(&j->lock);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ spin_lock(&j->lock);
+ }
+ spin_unlock(&j->lock);
+err:
+ ret = bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
new file mode 100644
index 000000000000..494d1a6eddb0
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
+#define _BCACHEFS_JOURNAL_RECLAIM_H
+
+#define JOURNAL_PIN (32 * 1024)
+
+static inline void journal_reclaim_kick(struct journal *j)
+{
+ struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+ j->reclaim_kicked = true;
+ if (p)
+ wake_up_process(p);
+}
+
+unsigned bch2_journal_dev_buckets_available(struct journal *,
+ struct journal_device *,
+ enum journal_space_from);
+void bch2_journal_space_available(struct journal *);
+
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+ return pin->seq != 0;
+}
+
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+ EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
+
+ return &j->pin.data[seq & j->pin.mask];
+}
+
+void bch2_journal_reclaim_fast(struct journal *);
+bool __bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
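+/*
+ * Note: bch2_journal_pin_add() only ever moves a pin to an older (smaller)
+ * seq, and bch2_journal_pin_update() only ever moves it to a newer one;
+ * otherwise both are no-ops.
+ */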
+static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
+
+static inline void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
+{
+ /* Guard against racing with journal_pin_drop(src): */
+ u64 seq = READ_ONCE(src->seq);
+
+ if (seq)
+ bch2_journal_pin_add(j, seq, dst, flush_fn);
+}
+
+static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
+
+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_do_discards(struct journal *);
+int bch2_journal_reclaim(struct journal *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
+
+bool bch2_journal_flush_pins(struct journal *, u64);
+
+static inline bool bch2_journal_flush_all_pins(struct journal *j)
+{
+ return bch2_journal_flush_pins(j, U64_MAX);
+}
+
+int bch2_journal_flush_device_pins(struct journal *, int);
+
+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
new file mode 100644
index 000000000000..ae4fb8c3a2bc
--- /dev/null
+++ b/fs/bcachefs/journal_sb.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+#include "darray.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+ const u64 *l = _l;
+ const u64 *r = _r;
+
+ return cmp_int(*l, *r);
+}
+
+static int bch2_sb_journal_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+ int ret = -BCH_ERR_invalid_sb_journal;
+ unsigned nr;
+ unsigned i;
+ u64 *b;
+
+ nr = bch2_nr_journal_buckets(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+ if (!b)
+ return -BCH_ERR_ENOMEM_sb_journal_validate;
+
+ for (i = 0; i < nr; i++)
+ b[i] = le64_to_cpu(journal->buckets[i]);
+
+ sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+ if (!b[0]) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0] < le16_to_cpu(m.first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0], le16_to_cpu(m.first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1], le64_to_cpu(m.nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (b[i] == b[i + 1]) {
+ prt_printf(err, "duplicate journal buckets %llu", b[i]);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+ .validate = bch2_sb_journal_validate,
+ .to_text = bch2_sb_journal_to_text,
+};
+
+struct u64_range {
+ u64 start;
+ u64 end;
+};
+
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+ const struct u64_range *l = _l;
+ const struct u64_range *r = _r;
+
+ return cmp_int(l->start, r->start);
+}
+
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+ int ret = -BCH_ERR_invalid_sb_journal;
+ unsigned nr;
+ unsigned i;
+ struct u64_range *b;
+
+ nr = bch2_sb_field_journal_v2_nr_entries(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
+
+ for (i = 0; i < nr; i++) {
+ b[i].start = le64_to_cpu(journal->d[i].start);
+ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+ }
+
+ sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+ if (!b[0].start) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0].start < le16_to_cpu(m.first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0].start, le16_to_cpu(m.first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++) {
+ if (b[i].end > b[i + 1].start) {
+ prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+ goto err;
+ }
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu-%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+ .validate = bch2_sb_journal_v2_validate,
+ .to_text = bch2_sb_journal_v2_to_text,
+};
+
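+/*
+ * Write the journal bucket list for @ca to the superblock, compacting runs of
+ * contiguous buckets into (start, nr) ranges - e.g. (illustrative) buckets
+ * {10, 11, 12, 20} become the two journal_v2 entries {10, 3} and {20, 1}.
+ */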
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
+ u64 *buckets, unsigned nr)
+{
+ struct bch_sb_field_journal_v2 *j;
+ unsigned i, dst = 0, nr_compacted = 1;
+
+ if (c)
+ lockdep_assert_held(&c->sb_lock);
+
+ if (!nr) {
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+ return 0;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (buckets[i] + 1 != buckets[i + 1])
+ nr_compacted++;
+
+ j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
+ (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
+ if (!j)
+ return -BCH_ERR_ENOSPC_sb_journal;
+
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+ j->d[dst].start = cpu_to_le64(buckets[0]);
+ j->d[dst].nr = cpu_to_le64(1);
+
+ for (i = 1; i < nr; i++) {
+ if (buckets[i] == buckets[i - 1] + 1) {
+ le64_add_cpu(&j->d[dst].nr, 1);
+ } else {
+ dst++;
+ j->d[dst].start = cpu_to_le64(buckets[i]);
+ j->d[dst].nr = cpu_to_le64(1);
+ }
+ }
+
+ BUG_ON(dst + 1 != nr_compacted);
+ return 0;
+}
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
new file mode 100644
index 000000000000..ba40a7e8d90a
--- /dev/null
+++ b/fs/bcachefs/journal_sb.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+ return j
+ ? (__le64 *) vstruct_end(&j->field) - j->buckets
+ : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+ if (!j)
+ return 0;
+
+ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
new file mode 100644
index 000000000000..f9d9aa95bf3a
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "eytzinger.h"
+#include "journal_seq_blacklist.h"
+#include "super-io.h"
+
+/*
+ * journal_seq_blacklist machinery:
+ *
+ * To guarantee order of btree updates after a crash, we need to detect when a
+ * btree node entry (bset) is newer than the newest journal entry that was
+ * successfully written, and ignore it - effectively ignoring any btree updates
+ * that didn't make it into the journal.
+ *
+ * If we didn't do this, we might have two btree nodes, a and b, both with
+ * updates that weren't written to the journal yet: if b was updated after a,
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
+ * to b happened, but not the updates to a that happened before it.
+ *
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
+ * because everything they contain will also have been journalled - and must
+ * still be present in the journal on disk until a journal entry has been
+ * written _after_ that bset was written.
+ *
+ * To accomplish this, bsets record the newest journal sequence number they
+ * contain updates for; then, on startup, the btree code queries the journal
+ * code to ask "Is this sequence number newer than the newest journal entry? If
+ * so, ignore it."
+ *
+ * When this happens, we must blacklist that journal sequence number: the
+ * journal must not write any entries with that sequence number, and it must
+ * record that it was blacklisted so that a) on recovery we don't think we have
+ * missing journal entries and b) so that the btree code continues to ignore
+ * that bset, until that btree node is rewritten.
+ */
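+/*
+ * Illustrative example: if the newest journal entry we could read has seq 100
+ * but a bset records updates up to seq 103, those updates never made it to the
+ * journal; seqs in the range 101-103 are blacklisted so they are never written
+ * or reused, and the bset keeps being ignored until its node is rewritten.
+ */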
+
+static unsigned sb_blacklist_u64s(unsigned nr)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl;
+
+ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
+}
+
+static struct bch_sb_field_journal_seq_blacklist *
+blacklist_entry_try_merge(struct bch_fs *c,
+ struct bch_sb_field_journal_seq_blacklist *bl,
+ unsigned i)
+{
+ unsigned nr = blacklist_nr_entries(bl);
+
+ if (le64_to_cpu(bl->start[i].end) >=
+ le64_to_cpu(bl->start[i + 1].start)) {
+ bl->start[i].end = bl->start[i + 1].end;
+ --nr;
+ memmove(&bl->start[i],
+ &bl->start[i + 1],
+ sizeof(bl->start[0]) * (nr - i));
+
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ sb_blacklist_u64s(nr));
+ BUG_ON(!bl);
+ }
+
+ return bl;
+}
+
+static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
+ u64 start, u64 end)
+{
+ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
+}
+
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl;
+ unsigned i, nr;
+ int ret = 0;
+
+ mutex_lock(&c->sb_lock);
+ bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ nr = blacklist_nr_entries(bl);
+
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e =
+ bl->start + i;
+
+ if (bl_entry_contig_or_overlaps(e, start, end)) {
+ e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
+ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
+
+ if (i + 1 < nr)
+ bl = blacklist_entry_try_merge(c, bl, i);
+ if (i)
+ bl = blacklist_entry_try_merge(c, bl, i - 1);
+ goto out_write_sb;
+ }
+ }
+
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ sb_blacklist_u64s(nr + 1));
+ if (!bl) {
+ ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
+ goto out;
+ }
+
+ bl->start[nr].start = cpu_to_le64(start);
+ bl->start[nr].end = cpu_to_le64(end);
+out_write_sb:
+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
+
+ ret = bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+
+ return ret ?: bch2_blacklist_table_initialize(c);
+}
+
+static int journal_seq_blacklist_table_cmp(const void *_l,
+ const void *_r, size_t size)
+{
+ const struct journal_seq_blacklist_table_entry *l = _l;
+ const struct journal_seq_blacklist_table_entry *r = _r;
+
+ return cmp_int(l->start, r->start);
+}
+
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
+ bool dirty)
+{
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+ struct journal_seq_blacklist_table_entry search = { .start = seq };
+ int idx;
+
+ if (!t)
+ return false;
+
+ idx = eytzinger0_find_le(t->entries, t->nr,
+ sizeof(t->entries[0]),
+ journal_seq_blacklist_table_cmp,
+ &search);
+ if (idx < 0)
+ return false;
+
+ BUG_ON(t->entries[idx].start > seq);
+
+ if (seq >= t->entries[idx].end)
+ return false;
+
+ if (dirty)
+ t->entries[idx].dirty = true;
+ return true;
+}
+
+int bch2_blacklist_table_initialize(struct bch_fs *c)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ struct journal_seq_blacklist_table *t;
+ unsigned i, nr = blacklist_nr_entries(bl);
+
+ if (!bl)
+ return 0;
+
+ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
+ GFP_KERNEL);
+ if (!t)
+ return -BCH_ERR_ENOMEM_blacklist_table_init;
+
+ t->nr = nr;
+
+ for (i = 0; i < nr; i++) {
+ t->entries[i].start = le64_to_cpu(bl->start[i].start);
+ t->entries[i].end = le64_to_cpu(bl->start[i].end);
+ }
+
+ eytzinger0_sort(t->entries,
+ t->nr,
+ sizeof(t->entries[0]),
+ journal_seq_blacklist_table_cmp,
+ NULL);
+
+ kfree(c->journal_seq_blacklist_table);
+ c->journal_seq_blacklist_table = t;
+ return 0;
+}
+
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ field_to_type(f, journal_seq_blacklist);
+ unsigned i, nr = blacklist_nr_entries(bl);
+
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e = bl->start + i;
+
+ if (le64_to_cpu(e->start) >=
+ le64_to_cpu(e->end)) {
+ prt_printf(err, "entry %u start >= end (%llu >= %llu)",
+ i, le64_to_cpu(e->start), le64_to_cpu(e->end));
+ return -BCH_ERR_invalid_sb_journal_seq_blacklist;
+ }
+
+ if (i + 1 < nr &&
+ le64_to_cpu(e[0].end) >
+ le64_to_cpu(e[1].start)) {
+ prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
+ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
+ return -BCH_ERR_invalid_sb_journal_seq_blacklist;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ field_to_type(f, journal_seq_blacklist);
+ struct journal_seq_blacklist_entry *i;
+ unsigned nr = blacklist_nr_entries(bl);
+
+ for (i = bl->start; i < bl->start + nr; i++) {
+ if (i != bl->start)
+ prt_printf(out, " ");
+
+ prt_printf(out, "%llu-%llu",
+ le64_to_cpu(i->start),
+ le64_to_cpu(i->end));
+ }
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
+ .validate = bch2_sb_journal_seq_blacklist_validate,
+ .to_text = bch2_sb_journal_seq_blacklist_to_text
+};
+
+void bch2_blacklist_entries_gc(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ journal_seq_blacklist_gc_work);
+ struct journal_seq_blacklist_table *t;
+ struct bch_sb_field_journal_seq_blacklist *bl;
+ struct journal_seq_blacklist_entry *src, *dst;
+ struct btree_trans *trans = bch2_trans_get(c);
+ unsigned i, nr, new_nr;
+ int ret;
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
+ bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
+ 0, 0, BTREE_ITER_PREFETCH);
+retry:
+ bch2_trans_begin(trans);
+
+ b = bch2_btree_iter_peek_node(&iter);
+
+ while (!(ret = PTR_ERR_OR_ZERO(b)) &&
+ b &&
+ !test_bit(BCH_FS_STOPPING, &c->flags))
+ b = bch2_btree_iter_next_node(&iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ bch2_trans_put(trans);
+ if (ret)
+ return;
+
+ mutex_lock(&c->sb_lock);
+ bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ if (!bl)
+ goto out;
+
+ nr = blacklist_nr_entries(bl);
+ dst = bl->start;
+
+ t = c->journal_seq_blacklist_table;
+ BUG_ON(nr != t->nr);
+
+ for (src = bl->start, i = eytzinger0_first(t->nr);
+ src < bl->start + nr;
+ src++, i = eytzinger0_next(i, nr)) {
+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
+
+ if (t->entries[i].dirty)
+ *dst++ = *src;
+ }
+
+ new_nr = dst - bl->start;
+
+ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
+
+ if (new_nr != nr) {
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ new_nr ? sb_blacklist_u64s(new_nr) : 0);
+ BUG_ON(new_nr && !bl);
+
+ if (!new_nr)
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
+
+ bch2_write_super(c);
+ }
+out:
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
new file mode 100644
index 000000000000..afb886ec8e25
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+
+static inline unsigned
+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
+{
+ return bl
+ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
+ sizeof(struct journal_seq_blacklist_entry))
+ : 0;
+}
+
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
+int bch2_blacklist_table_initialize(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
+
+void bch2_blacklist_entries_gc(struct work_struct *);
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
new file mode 100644
index 000000000000..a756b69582e3
--- /dev/null
+++ b/fs/bcachefs/journal_types.h
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_TYPES_H
+#define _BCACHEFS_JOURNAL_TYPES_H
+
+#include <linux/cache.h>
+#include <linux/workqueue.h>
+
+#include "alloc_types.h"
+#include "super_types.h"
+#include "fifo.h"
+
+#define JOURNAL_BUF_BITS 2
+#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
+
+/*
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
+ */
+struct journal_buf {
+ struct jset *data;
+
+ __BKEY_PADDED(key, BCH_REPLICAS_MAX);
+ struct bch_devs_list devs_written;
+
+ struct closure_waitlist wait;
+ u64 last_seq; /* copy of data->last_seq */
+ long expires;
+ u64 flush_time;
+
+ unsigned buf_size; /* size in bytes of @data */
+ unsigned sectors; /* maximum size for current entry */
+ unsigned disk_sectors; /* maximum size entry could have been, if
+ buf_size was bigger */
+ unsigned u64s_reserved;
+ bool noflush; /* write has already been kicked off, and was noflush */
+ bool must_flush; /* something wants a flush */
+ bool separate_flush;
+};
+
+/*
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
+ * flushed:
+ */
+
+enum journal_pin_type {
+ JOURNAL_PIN_btree,
+ JOURNAL_PIN_key_cache,
+ JOURNAL_PIN_other,
+ JOURNAL_PIN_NR,
+};
+
+struct journal_entry_pin_list {
+ struct list_head list[JOURNAL_PIN_NR];
+ struct list_head flushed;
+ atomic_t count;
+ struct bch_devs_list devs;
+};
+
+struct journal;
+struct journal_entry_pin;
+typedef int (*journal_pin_flush_fn)(struct journal *j,
+ struct journal_entry_pin *, u64);
+
+struct journal_entry_pin {
+ struct list_head list;
+ journal_pin_flush_fn flush;
+ u64 seq;
+};
+
+struct journal_res {
+ bool ref;
+ u8 idx;
+ u16 u64s;
+ u32 offset;
+ u64 seq;
+};
+
+union journal_res_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ u64 cur_entry_offset:20,
+ idx:2,
+ unwritten_idx:2,
+ buf0_count:10,
+ buf1_count:10,
+ buf2_count:10,
+ buf3_count:10;
+ };
+};
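+
+/*
+ * The bitfields above pack into a single 64 bit word
+ * (20 + 2 + 2 + 4 * 10 = 64 bits), so the whole reservation state can be read
+ * and updated with one atomic64 operation.
+ */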
+
+/* bytes: */
+#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
+
+/*
+ * We stash some journal state as sentinel values in cur_entry_offset:
+ * note - cur_entry_offset is in units of u64s
+ */
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+
+#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
+
+struct journal_space {
+ /* Units of 512 byte sectors: */
+ unsigned next_entry; /* How big the next journal entry can be */
+ unsigned total;
+};
+
+enum journal_space_from {
+ journal_space_discarded,
+ journal_space_clean_ondisk,
+ journal_space_clean,
+ journal_space_total,
+ journal_space_nr,
+};
+
+enum journal_flags {
+ JOURNAL_REPLAY_DONE,
+ JOURNAL_STARTED,
+ JOURNAL_MAY_SKIP_FLUSH,
+ JOURNAL_NEED_FLUSH_WRITE,
+};
+
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS() \
+ x(ok) \
+ x(blocked) \
+ x(max_in_flight) \
+ x(journal_full) \
+ x(journal_pin_full) \
+ x(journal_stuck) \
+ x(insufficient_devices)
+
+enum journal_errors {
+#define x(n) JOURNAL_ERR_##n,
+ JOURNAL_ERRORS()
+#undef x
+};
+
+typedef DARRAY(u64) darray_u64;
+
+/* Embedded in struct bch_fs */
+struct journal {
+ /* Fastpath stuff up front: */
+ struct {
+
+ union journal_res_state reservations;
+ enum bch_watermark watermark;
+
+ } __aligned(SMP_CACHE_BYTES);
+
+ unsigned long flags;
+
+ /* Max size of current journal entry */
+ unsigned cur_entry_u64s;
+ unsigned cur_entry_sectors;
+
+ /* Reserved space in journal entry to be used just prior to write */
+ unsigned entry_u64s_reserved;
+
+
+ /*
+ * JOURNAL_ERR_ok, or the reason we currently can't open a new journal
+ * entry (journal full, insufficient devices, ...):
+ */
+ enum journal_errors cur_entry_error;
+
+ unsigned buf_size_want;
+ /*
+ * We may queue up some things to be journalled (log messages) before
+ * the journal has actually started - stash them here:
+ */
+ darray_u64 early_journal_entries;
+
+ /*
+ * The journal entry buffers: one is currently open for new entries,
+ * while the others may be in flight, being written out.
+ */
+ struct journal_buf buf[JOURNAL_BUF_NR];
+
+ spinlock_t lock;
+
+ /* if nonzero, we may not open a new journal entry: */
+ unsigned blocked;
+
+ /* Used when waiting because the journal was full */
+ wait_queue_head_t wait;
+ struct closure_waitlist async_wait;
+ struct closure_waitlist preres_wait;
+
+ struct closure io;
+ struct delayed_work write_work;
+
+ /* Sequence number of most recent journal entry (last entry in @pin) */
+ atomic64_t seq;
+
+ /* seq, last_seq from the most recent journal entry successfully written */
+ u64 seq_ondisk;
+ u64 flushed_seq_ondisk;
+ u64 last_seq_ondisk;
+ u64 err_seq;
+ u64 last_empty_seq;
+
+ /*
+ * FIFO of journal entries whose btree updates have not yet been
+ * written out.
+ *
+ * Each entry is a reference count. The position in the FIFO is the
+ * entry's sequence number relative to @seq.
+ *
+ * The journal entry itself holds a reference count, put when the
+ * journal entry is written out. Each btree node modified by the journal
+ * entry also holds a reference count, put when the btree node is
+ * written.
+ *
+ * When a reference count reaches zero, the journal entry is no longer
+ * needed. When all journal entries in the oldest journal bucket are no
+ * longer needed, the bucket can be discarded and reused.
+ */
+ struct {
+ u64 front, back, size, mask;
+ struct journal_entry_pin_list *data;
+ } pin;
+
+ struct journal_space space[journal_space_nr];
+
+ u64 replay_journal_seq;
+ u64 replay_journal_seq_end;
+
+ struct write_point wp;
+ spinlock_t err_lock;
+
+ struct mutex reclaim_lock;
+ /*
+ * Used for waiting until journal reclaim has freed up space in the
+ * journal:
+ */
+ wait_queue_head_t reclaim_wait;
+ struct task_struct *reclaim_thread;
+ bool reclaim_kicked;
+ unsigned long next_reclaim;
+ u64 nr_direct_reclaim;
+ u64 nr_background_reclaim;
+
+ unsigned long last_flushed;
+ struct journal_entry_pin *flush_in_progress;
+ bool flush_in_progress_dropped;
+ wait_queue_head_t pin_flush_wait;
+
+ /* protects advancing ja->discard_idx: */
+ struct mutex discard_lock;
+ bool can_discard;
+
+ unsigned long last_flush_write;
+
+ u64 res_get_blocked_start;
+ u64 write_start_time;
+
+ u64 nr_flush_writes;
+ u64 nr_noflush_writes;
+
+ struct bch2_time_stats *flush_write_time;
+ struct bch2_time_stats *noflush_write_time;
+ struct bch2_time_stats *blocked_time;
+ struct bch2_time_stats *flush_seq_time;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map res_map;
+#endif
+} __aligned(SMP_CACHE_BYTES);
+
+/*
+ * Embedded in struct bch_dev. First three fields refer to the array of journal
+ * buckets, in bch_sb.
+ */
+struct journal_device {
+ /*
+ * For each journal bucket, contains the max sequence number of the
+ * journal writes it contains - so we know when a bucket can be reused.
+ */
+ u64 *bucket_seq;
+
+ unsigned sectors_free;
+
+ /*
+ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
+ */
+ unsigned discard_idx; /* Next bucket to discard */
+ unsigned dirty_idx_ondisk;
+ unsigned dirty_idx;
+ unsigned cur_idx; /* Journal bucket we're currently writing to */
+ unsigned nr;
+
+ u64 *buckets;
+
+ /* Bio for journal reads/writes to this device */
+ struct bio *bio;
+
+ /* for bch_journal_read_device */
+ struct closure read;
+};
+
+/*
+ * journal_entry_res - reserve space in every journal entry:
+ */
+struct journal_entry_res {
+ unsigned u64s;
+};
+
+#endif /* _BCACHEFS_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
new file mode 100644
index 000000000000..5699cd4873c8
--- /dev/null
+++ b/fs/bcachefs/keylist.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "keylist.h"
+
+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
+ size_t nr_inline_u64s, size_t new_u64s)
+{
+ size_t oldsize = bch2_keylist_u64s(l);
+ size_t newsize = oldsize + new_u64s;
+ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
+ u64 *new_keys;
+
+ newsize = roundup_pow_of_two(newsize);
+
+ if (newsize <= nr_inline_u64s ||
+ (old_buf && roundup_pow_of_two(oldsize) == newsize))
+ return 0;
+
+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS);
+ if (!new_keys)
+ return -ENOMEM;
+
+ if (!old_buf)
+ memcpy_u64s(new_keys, inline_u64s, oldsize);
+
+ l->keys_p = new_keys;
+ l->top_p = new_keys + oldsize;
+
+ return 0;
+}
+
+void bch2_keylist_pop_front(struct keylist *l)
+{
+ l->top_p -= bch2_keylist_front(l)->k.u64s;
+
+ memmove_u64s_down(l->keys,
+ bkey_next(l->keys),
+ bch2_keylist_u64s(l));
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *l)
+{
+ struct bkey_i *k;
+
+ for_each_keylist_key(l, k)
+ BUG_ON(bkey_next(k) != l->top &&
+ bpos_ge(k->k.p, bkey_next(k)->k.p));
+}
+#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
new file mode 100644
index 000000000000..fe759c7031e0
--- /dev/null
+++ b/fs/bcachefs/keylist.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_H
+#define _BCACHEFS_KEYLIST_H
+
+#include "keylist_types.h"
+
+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
+void bch2_keylist_pop_front(struct keylist *);
+
+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
+{
+ l->top_p = l->keys_p = inline_keys;
+}
+
+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
+{
+ if (l->keys_p != inline_keys)
+ kfree(l->keys_p);
+}
+
+static inline void bch2_keylist_push(struct keylist *l)
+{
+ l->top = bkey_next(l->top);
+}
+
+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
+{
+ bkey_copy(l->top, k);
+ bch2_keylist_push(l);
+}
+
+static inline bool bch2_keylist_empty(struct keylist *l)
+{
+ return l->top == l->keys;
+}
+
+static inline size_t bch2_keylist_u64s(struct keylist *l)
+{
+ return l->top_p - l->keys_p;
+}
+
+static inline size_t bch2_keylist_bytes(struct keylist *l)
+{
+ return bch2_keylist_u64s(l) * sizeof(u64);
+}
+
+static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
+{
+ return l->keys;
+}
+
+#define for_each_keylist_key(_keylist, _k) \
+ for (_k = (_keylist)->keys; \
+ _k != (_keylist)->top; \
+ _k = bkey_next(_k))
+
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+ struct bkey_i *k;
+ u64 ret = 0;
+
+ for_each_keylist_key(keys, k)
+ ret += k->k.size;
+
+ return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *);
+#else
+static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
+#endif
+
+#endif /* _BCACHEFS_KEYLIST_H */
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
new file mode 100644
index 000000000000..4b3ff7d8a875
--- /dev/null
+++ b/fs/bcachefs/keylist_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_TYPES_H
+#define _BCACHEFS_KEYLIST_TYPES_H
+
+struct keylist {
+ union {
+ struct bkey_i *keys;
+ u64 *keys_p;
+ };
+ union {
+ struct bkey_i *top;
+ u64 *top_p;
+ };
+};
+
+#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
new file mode 100644
index 000000000000..8640f7dee0de
--- /dev/null
+++ b/fs/bcachefs/logged_ops.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "error.h"
+#include "io_misc.h"
+#include "logged_ops.h"
+#include "super.h"
+
+struct bch_logged_op_fn {
+ u8 type;
+ int (*resume)(struct btree_trans *, struct bkey_i *);
+};
+
+static const struct bch_logged_op_fn logged_op_fns[] = {
+#define x(n) { \
+ .type = KEY_TYPE_logged_op_##n, \
+ .resume = bch2_resume_logged_op_##n, \
+},
+ BCH_LOGGED_OPS()
+#undef x
+};
+
+static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
+ if (logged_op_fns[i].type == type)
+ return logged_op_fns + i;
+ return NULL;
+}
+
+static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
+ struct bkey_buf sk;
+ u32 restart_count = trans->restart_count;
+ int ret;
+
+ if (!fn)
+ return 0;
+
+ bch2_bkey_buf_init(&sk);
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
+ fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+
+ bch2_bkey_buf_exit(&sk, c);
+ return ret;
+}
+
+int bch2_resume_logged_ops(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key2(trans, iter,
+ BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+ resume_logged_op(trans, &iter, k)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+ if (ret)
+ return ret;
+
+ k->k.p = iter.pos;
+
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+ return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ __bch2_logged_op_start(trans, k));
+}
+
+void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+{
+ int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
+ /*
+ * This needs to be a fatal error because we've left an unfinished
+ * operation in the logged ops btree.
+ *
+ * We should only ever see an error here if the filesystem has already
+ * been shut down, but make sure of that here:
+ */
+ if (ret) {
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+ bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
+ __func__, buf.buf, bch2_err_str(ret));
+ printbuf_exit(&buf);
+ }
+}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
new file mode 100644
index 000000000000..4d1e786a27a8
--- /dev/null
+++ b/fs/bcachefs/logged_ops.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_H
+#define _BCACHEFS_LOGGED_OPS_H
+
+#include "bkey.h"
+
+#define BCH_LOGGED_OPS() \
+ x(truncate) \
+ x(finsert)
+
+static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
+{
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
+}
+
+int bch2_resume_logged_ops(struct bch_fs *);
+int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
+void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+
+#endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
new file mode 100644
index 000000000000..a5cc0ed195d6
--- /dev/null
+++ b/fs/bcachefs/lru.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+/* KEY_TYPE_lru is obsolete: */
+int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err,
+ lru_entry_at_time_0,
+ "lru entry at time=0");
+fsck_err:
+ return ret;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
+{
+ prt_printf(out, "%llu:%llu -> %llu:%llu",
+ lru_pos_id(lru),
+ lru_pos_time(lru),
+ u64_to_bucket(lru.offset).inode,
+ u64_to_bucket(lru.offset).offset);
+}
+
+static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
+ u64 dev_bucket, u64 time, bool set)
+{
+ return time
+ ? bch2_btree_bit_mod(trans, BTREE_ID_lru,
+ lru_pos(lru_id, dev_bucket, time), set)
+ : 0;
+}
+
+int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
+}
+
+int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
+}
+
+int bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
+{
+ if (old_time == new_time)
+ return 0;
+
+ return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
+ bch2_lru_set(trans, lru_id, dev_bucket, new_time);
+}
+
+static const char * const bch2_lru_types[] = {
+#define x(n) #n,
+ BCH_LRU_TYPES()
+#undef x
+ NULL
+};
+
+static int bch2_check_lru_key(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k,
+ struct bpos *last_flushed_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ enum bch_lru_type type = lru_type(lru_k);
+ struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
+ u64 idx;
+ int ret;
+
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+ lru_entry_to_invalid_bucket,
+ "lru key points to nonexistent device:bucket %llu:%llu",
+ alloc_pos.inode, alloc_pos.offset))
+ return bch2_btree_delete_at(trans, lru_iter, 0);
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ switch (type) {
+ case BCH_LRU_read:
+ idx = alloc_lru_idx_read(*a);
+ break;
+ case BCH_LRU_fragmentation:
+ idx = a->fragmentation_lru;
+ break;
+ }
+
+ if (lru_k.k->type != KEY_TYPE_set ||
+ lru_pos_time(lru_k.k->p) != idx) {
+ if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
+ *last_flushed_pos = lru_k.k->p;
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+
+ if (c->opts.reconstruct_alloc ||
+ fsck_err(c, lru_entry_bad,
+ "incorrect lru entry: lru %s time %llu\n"
+ " %s\n"
+ " for %s",
+ bch2_lru_types[type],
+ lru_pos_time(lru_k.k->p),
+ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
+ ret = bch2_btree_delete_at(trans, lru_iter, 0);
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+int bch2_check_lrus(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos last_flushed_pos = POS_MIN;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
+ if (ret)
+ bch_err_fn(c, ret);
+	return ret;
+}
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
new file mode 100644
index 000000000000..429dca816df5
--- /dev/null
+++ b/fs/bcachefs/lru.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
+static inline u64 lru_pos_id(struct bpos pos)
+{
+ return pos.inode >> LRU_TIME_BITS;
+}
+
+static inline u64 lru_pos_time(struct bpos pos)
+{
+ return pos.inode & ~(~0ULL << LRU_TIME_BITS);
+}
+
+static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
+{
+ struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
+
+ EBUG_ON(time > LRU_TIME_MAX);
+ EBUG_ON(lru_pos_id(pos) != lru_id);
+ EBUG_ON(lru_pos_time(pos) != time);
+ EBUG_ON(pos.offset != dev_bucket);
+
+ return pos;
+}
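+
+/*
+ * Example (illustrative): lru_pos(4, dev_bucket, 1000) packs the lru id into
+ * the top 16 bits of the inode field and the time into the low 48 bits:
+ *
+ *	pos.inode  == ((u64) 4 << 48) | 1000;
+ *	pos.offset == dev_bucket;
+ *
+ * lru_pos_id() and lru_pos_time() recover the two halves.
+ */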
+
+#define BCH_LRU_TYPES() \
+ x(read) \
+ x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+ BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+
+static inline enum bch_lru_type lru_type(struct bkey_s_c l)
+{
+ u16 lru_id = l.k->p.inode >> 48;
+
+ if (lru_id == BCH_LRU_FRAGMENTATION_START)
+ return BCH_LRU_fragmentation;
+ return BCH_LRU_read;
+}
+
+int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
+
+#define bch2_bkey_ops_lru ((struct bkey_ops) { \
+ .key_invalid = bch2_lru_invalid, \
+ .val_to_text = bch2_lru_to_text, \
+ .min_val_size = 8, \
+})
+
+int bch2_lru_del(struct btree_trans *, u16, u64, u64);
+int bch2_lru_set(struct btree_trans *, u16, u64, u64);
+int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+
+int bch2_check_lrus(struct bch_fs *);
+
+#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
new file mode 100644
index 000000000000..1f0801e2e565
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions for incremental mean and variance.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Copyright © 2022 Daniel B. Hill
+ *
+ * Author: Daniel B. Hill <daniel@gluo.nz>
+ *
+ * Description:
+ *
+ * This includes some incremental algorithms for mean and variance calculation.
+ *
+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ *
+ * Create a struct and, if it's the weighted variant, set the weight field
+ * (weight = 2^k).
+ *
+ * Use mean_and_variance[_weighted]_update() on the struct to update its state.
+ *
+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean
+ * and variance; some computation is deferred to these functions for
+ * performance reasons.
+ *
+ * See mean_and_variance_test.c for examples of usage.
+ *
+ * DO NOT access the mean and variance fields of the weighted variants directly.
+ * DO NOT change the weight after calling update.
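+ *
+ * A minimal usage sketch (illustrative; values match
+ * mean_and_variance_basic_test() in mean_and_variance_test.c):
+ *
+ *	struct mean_and_variance s = {};
+ *
+ *	mean_and_variance_update(&s, 2);
+ *	mean_and_variance_update(&s, 2);
+ *	mean_and_variance_update(&s, 4);
+ *	mean_and_variance_update(&s, 4);
+ *
+ *	mean_and_variance_get_mean(s);		// 3
+ *	mean_and_variance_get_variance(s);	// 1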
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+
+#include "mean_and_variance.h"
+
+u128_u u128_div(u128_u n, u64 d)
+{
+ u128_u r;
+ u64 rem;
+ u64 hi = u128_hi(n);
+ u64 lo = u128_lo(n);
+ u64 h = hi & ((u64) U32_MAX << 32);
+ u64 l = (hi & (u64) U32_MAX) << 32;
+
+ r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
+ r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+ return r;
+}
+EXPORT_SYMBOL_GPL(u128_div);
+
+/**
+ * mean_and_variance_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_get_mean(struct mean_and_variance s)
+{
+ return s.n ? div64_u64(s.sum, s.n) : 0;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
+
+/**
+ * mean_and_variance_get_variance() - get variance from @s1
+ *
+ * see linked pdf equation 12.
+ */
+u64 mean_and_variance_get_variance(struct mean_and_variance s1)
+{
+ if (s1.n) {
+ u128_u s2 = u128_div(s1.sum_squares, s1.n);
+ u64 s3 = abs(mean_and_variance_get_mean(s1));
+
+ return u128_lo(u128_sub(s2, u128_square(s3)));
+ } else {
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
+
+/**
+ * mean_and_variance_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_get_stddev(struct mean_and_variance s)
+{
+ return int_sqrt64(mean_and_variance_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
+
+/**
+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
+ * @s: state to update
+ * @x: new sample
+ *
+ * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
+ * values are stored bitshifted for performance and added precision.
+ */
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+{
+ // previous weighted variance.
+ u8 w = s->weight;
+ u64 var_w0 = s->variance;
+ // new value weighted.
+ s64 x_w = x << w;
+ s64 diff_w = x_w - s->mean;
+ s64 diff = fast_divpow2(diff_w, w);
+ // new mean weighted.
+ s64 u_w1 = s->mean + diff;
+
+ if (!s->init) {
+ s->mean = x_w;
+ s->variance = 0;
+ } else {
+ s->mean = u_w1;
+ s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+ }
+ s->init = true;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
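+
+/*
+ * A minimal usage sketch (illustrative; values match
+ * mean_and_variance_weighted_test() in mean_and_variance_test.c):
+ *
+ *	struct mean_and_variance_weighted s = { .weight = 2 };
+ *
+ *	mean_and_variance_weighted_update(&s, 10);
+ *	mean_and_variance_weighted_update(&s, 20);
+ *
+ *	mean_and_variance_weighted_get_mean(s);		// 12
+ *	mean_and_variance_weighted_get_variance(s);	// 18
+ */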
+
+/**
+ * mean_and_variance_weighted_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+{
+ return fast_divpow2(s.mean, s.weight);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
+
+/**
+ * mean_and_variance_weighted_get_variance() - get variance from @s
+ */
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+{
+	// always positive, so we don't need fast_divpow2
+ return s.variance >> s.weight;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
+
+/**
+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+{
+ return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
new file mode 100644
index 000000000000..647505010b39
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance.h
@@ -0,0 +1,198 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MEAN_AND_VARIANCE_H_
+#define MEAN_AND_VARIANCE_H_
+
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+
+#define SQRT_U64_MAX 4294967295ULL
+
+/*
+ * u128_u: u128 user mode, because not all architectures support a real int128
+ * type
+ */
+
+#ifdef __SIZEOF_INT128__
+
+typedef struct {
+ unsigned __int128 v;
+} __aligned(16) u128_u;
+
+static inline u128_u u64_to_u128(u64 a)
+{
+ return (u128_u) { .v = a };
+}
+
+static inline u64 u128_lo(u128_u a)
+{
+ return a.v;
+}
+
+static inline u64 u128_hi(u128_u a)
+{
+ return a.v >> 64;
+}
+
+static inline u128_u u128_add(u128_u a, u128_u b)
+{
+ a.v += b.v;
+ return a;
+}
+
+static inline u128_u u128_sub(u128_u a, u128_u b)
+{
+ a.v -= b.v;
+ return a;
+}
+
+static inline u128_u u128_shl(u128_u a, s8 shift)
+{
+ a.v <<= shift;
+ return a;
+}
+
+static inline u128_u u128_square(u64 a)
+{
+ u128_u b = u64_to_u128(a);
+
+ b.v *= b.v;
+ return b;
+}
+
+#else
+
+typedef struct {
+ u64 hi, lo;
+} __aligned(16) u128_u;
+
+/* conversions */
+
+static inline u128_u u64_to_u128(u64 a)
+{
+ return (u128_u) { .lo = a };
+}
+
+static inline u64 u128_lo(u128_u a)
+{
+ return a.lo;
+}
+
+static inline u64 u128_hi(u128_u a)
+{
+ return a.hi;
+}
+
+/* arithmetic */
+
+static inline u128_u u128_add(u128_u a, u128_u b)
+{
+ u128_u c;
+
+ c.lo = a.lo + b.lo;
+ c.hi = a.hi + b.hi + (c.lo < a.lo);
+ return c;
+}
+
+static inline u128_u u128_sub(u128_u a, u128_u b)
+{
+ u128_u c;
+
+ c.lo = a.lo - b.lo;
+ c.hi = a.hi - b.hi - (c.lo > a.lo);
+ return c;
+}
+
+static inline u128_u u128_shl(u128_u i, s8 shift)
+{
+ u128_u r;
+
+ r.lo = i.lo << shift;
+ if (shift < 64)
+ r.hi = (i.hi << shift) | (i.lo >> (64 - shift));
+ else {
+ r.hi = i.lo << (shift - 64);
+ r.lo = 0;
+ }
+ return r;
+}
+
+static inline u128_u u128_square(u64 i)
+{
+ u128_u r;
+ u64 h = i >> 32, l = i & U32_MAX;
+
+ r = u128_shl(u64_to_u128(h*h), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
+ r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
+ r = u128_add(r, u64_to_u128(l*l));
+ return r;
+}
+
+#endif
+
+static inline u128_u u64s_to_u128(u64 hi, u64 lo)
+{
+ u128_u c = u64_to_u128(hi);
+
+ c = u128_shl(c, 64);
+ c = u128_add(c, u64_to_u128(lo));
+ return c;
+}
+
+u128_u u128_div(u128_u n, u64 d);
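+
+/*
+ * e.g. (illustrative, cf. mean_and_variance_u128_basic_test()):
+ * u64s_to_u128(1, 0) represents 2^64, and u128_div(u64s_to_u128(1, 0), 2)
+ * gives a result with u128_hi() == 0 and u128_lo() == 1ULL << 63.
+ */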
+
+struct mean_and_variance {
+ s64 n;
+ s64 sum;
+ u128_u sum_squares;
+};
+
+/* exponentially weighted variant */
+struct mean_and_variance_weighted {
+ bool init;
+	u8 weight; /* base 2 logarithm */
+ s64 mean;
+ u64 variance;
+};
+
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+static inline s64 fast_divpow2(s64 n, u8 d)
+{
+ return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
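+
+/*
+ * e.g. (illustrative): fast_divpow2(-7, 1) == -3, matching truncation towards
+ * zero, whereas a plain arithmetic shift (-7 >> 1) would give -4.
+ */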
+
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s with a new sample @v
+ * @s: the mean_and_variance to update.
+ * @v: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
+static inline void
+mean_and_variance_update(struct mean_and_variance *s, s64 v)
+{
+ s->n++;
+ s->sum += v;
+ s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
+}
+
+s64 mean_and_variance_get_mean(struct mean_and_variance s);
+u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+
+#endif // MEAN_AND_VARIANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
new file mode 100644
index 000000000000..019583c3ca0e
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "mean_and_variance.h"
+
+#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
+
+static void mean_and_variance_basic_test(struct kunit *test)
+{
+ struct mean_and_variance s = {};
+
+ mean_and_variance_update(&s, 2);
+ mean_and_variance_update(&s, 2);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0);
+ KUNIT_EXPECT_EQ(test, s.n, 2);
+
+ mean_and_variance_update(&s, 4);
+ mean_and_variance_update(&s, 4);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1);
+ KUNIT_EXPECT_EQ(test, s.n, 4);
+}
+
+/*
+ * Test values computed using a spreadsheet from the pseudocode at the bottom of:
+ * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ */
+
+static void mean_and_variance_weighted_test(struct kunit *test)
+{
+ struct mean_and_variance_weighted s = { .weight = 2 };
+
+ mean_and_variance_weighted_update(&s, 10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+
+ mean_and_variance_weighted_update(&s, 20);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+
+ mean_and_variance_weighted_update(&s, 30);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+
+ s = (struct mean_and_variance_weighted) { .weight = 2 };
+
+ mean_and_variance_weighted_update(&s, -10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+
+ mean_and_variance_weighted_update(&s, -20);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+
+ mean_and_variance_weighted_update(&s, -30);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+}
+
+static void mean_and_variance_weighted_advanced_test(struct kunit *test)
+{
+ struct mean_and_variance_weighted s = { .weight = 8 };
+ s64 i;
+
+ for (i = 10; i <= 100; i += 10)
+ mean_and_variance_weighted_update(&s, i);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+
+ s = (struct mean_and_variance_weighted) { .weight = 8 };
+
+ for (i = -10; i >= -100; i -= 10)
+ mean_and_variance_weighted_update(&s, i);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+}
+
+static void do_mean_and_variance_test(struct kunit *test,
+ s64 initial_value,
+ s64 initial_n,
+ s64 n,
+ unsigned weight,
+ s64 *data,
+ s64 *mean,
+ s64 *stddev,
+ s64 *weighted_mean,
+ s64 *weighted_stddev)
+{
+ struct mean_and_variance mv = {};
+ struct mean_and_variance_weighted vw = { .weight = weight };
+
+ for (unsigned i = 0; i < initial_n; i++) {
+ mean_and_variance_update(&mv, initial_value);
+ mean_and_variance_weighted_update(&vw, initial_value);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
+ }
+
+ for (unsigned i = 0; i < n; i++) {
+ mean_and_variance_update(&mv, data[i]);
+ mean_and_variance_weighted_update(&vw, data[i]);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
+ }
+
+ KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
+}
+
+/* Test behaviour with a single outlier, then back to steady state: */
+static void mean_and_variance_test_1(struct kunit *test)
+{
+ s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
+ s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 };
+ s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 };
+ s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
+ s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
+
+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+ d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_test_2(struct kunit *test)
+{
+ s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
+ s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 };
+ s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 };
+ s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
+ s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
+
+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+ d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+/* Test behaviour where we switch from one steady state to another: */
+static void mean_and_variance_test_3(struct kunit *test)
+{
+ s64 d[] = { 100, 100, 100, 100, 100 };
+ s64 mean[] = { 22, 32, 40, 46, 50 };
+ s64 stddev[] = { 32, 39, 42, 44, 45 };
+ s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
+ s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
+
+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+ d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_test_4(struct kunit *test)
+{
+ s64 d[] = { 100, 100, 100, 100, 100 };
+ s64 mean[] = { 10, 11, 12, 13, 14 };
+ s64 stddev[] = { 9, 13, 15, 17, 19 };
+ s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
+ s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
+
+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+ d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_fast_divpow2(struct kunit *test)
+{
+ s64 i;
+ u8 d;
+
+ for (i = 0; i < 100; i++) {
+ d = 0;
+ KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d));
+ KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d));
+ for (d = 1; d < 32; d++) {
+ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)),
+ div_u64(i, 1 << d), "%lld %u", i, d);
+ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)),
+ div_u64(i, 1 << d), "%lld %u", -i, d);
+ }
+ }
+}
+
+static void mean_and_variance_u128_basic_test(struct kunit *test)
+{
+ u128_u a = u64s_to_u128(0, U64_MAX);
+ u128_u a1 = u64s_to_u128(0, 1);
+ u128_u b = u64s_to_u128(1, 0);
+ u128_u c = u64s_to_u128(0, 1LLU << 63);
+ u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0);
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0);
+
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX);
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1);
+
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31);
+}
+
+static struct kunit_case mean_and_variance_test_cases[] = {
+ KUNIT_CASE(mean_and_variance_fast_divpow2),
+ KUNIT_CASE(mean_and_variance_u128_basic_test),
+ KUNIT_CASE(mean_and_variance_basic_test),
+ KUNIT_CASE(mean_and_variance_weighted_test),
+ KUNIT_CASE(mean_and_variance_weighted_advanced_test),
+ KUNIT_CASE(mean_and_variance_test_1),
+ KUNIT_CASE(mean_and_variance_test_2),
+ KUNIT_CASE(mean_and_variance_test_3),
+ KUNIT_CASE(mean_and_variance_test_4),
+ {}
+};
+
+static struct kunit_suite mean_and_variance_test_suite = {
+ .name = "mean and variance tests",
+ .test_cases = mean_and_variance_test_cases
+};
+
+kunit_test_suite(mean_and_variance_test_suite);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
new file mode 100644
index 000000000000..e3a51f6d6c9b
--- /dev/null
+++ b/fs/bcachefs/migrate.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for moving data off a device.
+ */
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "extents.h"
+#include "io_write.h"
+#include "journal.h"
+#include "keylist.h"
+#include "migrate.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
+ unsigned dev_idx, int flags, bool metadata)
+{
+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+ unsigned nr_good;
+
+ bch2_bkey_drop_device(k, dev_idx);
+
+ nr_good = bch2_bkey_durability(c, k.s_c);
+ if ((!nr_good && !(flags & lost)) ||
+ (nr_good < replicas && !(flags & degraded)))
+ return -EINVAL;
+
+ return 0;
+}
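+
+/*
+ * e.g. (illustrative): with c->opts.data_replicas == 2, dropping a pointer
+ * that leaves nr_good == 1 is only allowed when BCH_FORCE_IF_DATA_DEGRADED is
+ * set, and nr_good == 0 additionally requires BCH_FORCE_IF_DATA_LOST;
+ * otherwise drop_dev_ptrs() fails with -EINVAL.
+ */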
+
+static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned dev_idx,
+ int flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ if (!bch2_bkey_has_device_c(k, dev_idx))
+ return 0;
+
+ n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
+ if (ret)
+ return ret;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k))
+ n->k.size = 0;
+ return 0;
+}
+
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ enum btree_id id;
+ int ret = 0;
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_ptrs(id))
+ continue;
+
+ ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct closure cl;
+ struct btree *b;
+ struct bkey_buf k;
+ unsigned id;
+ int ret;
+
+ /* don't handle this yet: */
+ if (flags & BCH_FORCE_IF_METADATA_LOST)
+ return -EINVAL;
+
+ trans = bch2_trans_get(c);
+ bch2_bkey_buf_init(&k);
+ closure_init_stack(&cl);
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_PREFETCH);
+retry:
+ ret = 0;
+ while (bch2_trans_begin(trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
+ if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
+ goto next;
+
+ bch2_bkey_buf_copy(&k, c, &b->key);
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
+ dev_idx, flags, true);
+ if (ret) {
+ bch_err(c, "Cannot drop device without losing data");
+ break;
+ }
+
+ ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+
+ if (ret) {
+ bch_err_msg(c, ret, "updating btree node key");
+ break;
+ }
+next:
+ bch2_btree_iter_next_node(&iter);
+ }
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ goto err;
+ }
+
+ bch2_btree_interior_updates_flush(c);
+ ret = 0;
+err:
+ bch2_bkey_buf_exit(&k, c);
+ bch2_trans_put(trans);
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ return ret;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+ bch2_dev_metadata_drop(c, dev_idx, flags);
+}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
new file mode 100644
index 000000000000..027efaa0d575
--- /dev/null
+++ b/fs/bcachefs/migrate.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MIGRATE_H
+#define _BCACHEFS_MIGRATE_H
+
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
+
+#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
new file mode 100644
index 000000000000..54830ee0ed88
--- /dev/null
+++ b/fs/bcachefs/move.c
@@ -0,0 +1,1154 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "io_read.h"
+#include "io_write.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "move.h"
+#include "replicas.h"
+#include "snapshot.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/ioprio.h>
+#include <linux/kthread.h>
+
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+{
+ if (trace_move_extent_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_move_extent(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
+{
+ if (trace_move_extent_read_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_move_extent_read(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+struct moving_io {
+ struct list_head read_list;
+ struct list_head io_list;
+ struct move_bucket_in_flight *b;
+ struct closure cl;
+ bool read_completed;
+
+ unsigned read_sectors;
+ unsigned write_sectors;
+
+ struct bch_read_bio rbio;
+
+ struct data_update write;
+ /* Must be last since it is variable size */
+ struct bio_vec bi_inline_vecs[0];
+};
+
+static void move_free(struct moving_io *io)
+{
+ struct moving_context *ctxt = io->write.ctxt;
+
+ if (io->b)
+ atomic_dec(&io->b->count);
+
+ bch2_data_update_exit(&io->write);
+
+ mutex_lock(&ctxt->lock);
+ list_del(&io->io_list);
+ wake_up(&ctxt->wait);
+ mutex_unlock(&ctxt->lock);
+
+ kfree(io);
+}
+
+static void move_write_done(struct bch_write_op *op)
+{
+ struct moving_io *io = container_of(op, struct moving_io, write.op);
+ struct moving_context *ctxt = io->write.ctxt;
+
+ if (io->write.op.error)
+ ctxt->write_error = true;
+
+ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
+ atomic_dec(&io->write.ctxt->write_ios);
+ move_free(io);
+ closure_put(&ctxt->cl);
+}
+
+static void move_write(struct moving_io *io)
+{
+ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+ move_free(io);
+ return;
+ }
+
+ closure_get(&io->write.ctxt->cl);
+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
+ atomic_inc(&io->write.ctxt->write_ios);
+
+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
+}
+
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
+{
+ struct moving_io *io =
+ list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
+
+ return io && io->read_completed ? io : NULL;
+}
+
+static void move_read_endio(struct bio *bio)
+{
+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+ struct moving_context *ctxt = io->write.ctxt;
+
+ atomic_sub(io->read_sectors, &ctxt->read_sectors);
+ atomic_dec(&ctxt->read_ios);
+ io->read_completed = true;
+
+ wake_up(&ctxt->wait);
+ closure_put(&ctxt->cl);
+}
+
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
+ bch2_trans_unlock_long(ctxt->trans);
+ list_del(&io->read_list);
+ move_write(io);
+ }
+}
+
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+{
+ unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
+
+ move_ctxt_wait_event(ctxt,
+ !atomic_read(&ctxt->write_sectors) ||
+ atomic_read(&ctxt->write_sectors) != sectors_pending);
+}
+
+static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+ bch2_trans_unlock_long(ctxt->trans);
+ closure_sync(&ctxt->cl);
+}
+
+void bch2_moving_ctxt_exit(struct moving_context *ctxt)
+{
+ struct bch_fs *c = ctxt->trans->c;
+
+ bch2_moving_ctxt_flush_all(ctxt);
+
+ EBUG_ON(atomic_read(&ctxt->write_sectors));
+ EBUG_ON(atomic_read(&ctxt->write_ios));
+ EBUG_ON(atomic_read(&ctxt->read_sectors));
+ EBUG_ON(atomic_read(&ctxt->read_ios));
+
+ mutex_lock(&c->moving_context_lock);
+ list_del(&ctxt->list);
+ mutex_unlock(&c->moving_context_lock);
+
+ bch2_trans_put(ctxt->trans);
+ memset(ctxt, 0, sizeof(*ctxt));
+}
+
+void bch2_moving_ctxt_init(struct moving_context *ctxt,
+ struct bch_fs *c,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
+{
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ ctxt->trans = bch2_trans_get(c);
+ ctxt->fn = (void *) _RET_IP_;
+ ctxt->rate = rate;
+ ctxt->stats = stats;
+ ctxt->wp = wp;
+ ctxt->wait_on_copygc = wait_on_copygc;
+
+ closure_init_stack(&ctxt->cl);
+
+ mutex_init(&ctxt->lock);
+ INIT_LIST_HEAD(&ctxt->reads);
+ INIT_LIST_HEAD(&ctxt->ios);
+ init_waitqueue_head(&ctxt->wait);
+
+ mutex_lock(&c->moving_context_lock);
+ list_add(&ctxt->list, &c->moving_context_list);
+ mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+ trace_move_data(c, stats);
+}
+
+void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->data_type = BCH_DATA_user;
+ scnprintf(stats->name, sizeof(stats->name), "%s", name);
+}
+
+int bch2_move_extent(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct moving_io *io;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned sectors = k.k->size, pages;
+ int ret = -ENOMEM;
+
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
+ trace_move_extent2(c, k);
+
+ bch2_data_update_opts_normalize(k, &data_opts);
+
+ if (!data_opts.rewrite_ptrs &&
+ !data_opts.extra_replicas) {
+ if (data_opts.kill_ptrs)
+ return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+ return 0;
+ }
+
+ /*
+ * Before memory allocations & taking nocow locks in
+ * bch2_data_update_init():
+ */
+ bch2_trans_unlock(trans);
+
+ /* write path might have to decompress data: */
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
+
+ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ io = kzalloc(sizeof(struct moving_io) +
+ sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ INIT_LIST_HEAD(&io->io_list);
+ io->write.ctxt = ctxt;
+ io->read_sectors = k.k->size;
+ io->write_sectors = k.k->size;
+
+ bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
+ bio_set_prio(&io->write.op.wbio.bio,
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
+ GFP_KERNEL))
+ goto err_free;
+
+ io->rbio.c = c;
+ io->rbio.opts = io_opts;
+ bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
+ io->rbio.bio.bi_vcnt = pages;
+ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ io->rbio.bio.bi_iter.bi_size = sectors << 9;
+
+ io->rbio.bio.bi_opf = REQ_OP_READ;
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
+ io->rbio.bio.bi_end_io = move_read_endio;
+
+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+ io_opts, data_opts, iter->btree_id, k);
+ if (ret)
+ goto err_free_pages;
+
+ io->write.op.end_io = move_write_done;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
+ if (ctxt->stats) {
+ atomic64_inc(&ctxt->stats->keys_moved);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+ }
+
+ if (bucket_in_flight) {
+ io->b = bucket_in_flight;
+ atomic_inc(&io->b->count);
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
+ trace_move_extent_read2(c, k);
+
+ mutex_lock(&ctxt->lock);
+ atomic_add(io->read_sectors, &ctxt->read_sectors);
+ atomic_inc(&ctxt->read_ios);
+
+ list_add_tail(&io->read_list, &ctxt->reads);
+ list_add_tail(&io->io_list, &ctxt->ios);
+ mutex_unlock(&ctxt->lock);
+
+ /*
+ * dropped by move_read_endio() - guards against use after free of
+ * ctxt when doing wakeup
+ */
+ closure_get(&ctxt->cl);
+ bch2_read_extent(trans, &io->rbio,
+ bkey_start_pos(k.k),
+ iter->btree_id, k, 0,
+ BCH_READ_NODECODE|
+ BCH_READ_LAST_FRAGMENT);
+ return 0;
+err_free_pages:
+ bio_free_pages(&io->write.op.wbio.bio);
+err_free:
+ kfree(io);
+err:
+ if (ret == -BCH_ERR_data_update_done)
+ return 0;
+
+ if (bch2_err_matches(ret, EROFS) ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]);
+ if (trace_move_extent_start_fail_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, ": ");
+ prt_str(&buf, bch2_err_str(ret));
+ trace_move_extent_start_fail(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ return ret;
+}
+
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *io_opts,
+ struct bkey_s_c extent_k)
+{
+ struct bch_fs *c = trans->c;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ if (io_opts->cur_inum != extent_k.k->p.inode) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ io_opts->d.nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != extent_k.k->p.inode)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
+
+ struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+ bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
+
+ ret = darray_push(&io_opts->d, e);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ io_opts->cur_inum = extent_k.k->p.inode;
+ }
+
+ ret = ret ?: trans_was_restarted(trans, restart_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (extent_k.k->p.snapshot) {
+ struct snapshot_io_opts_entry *i;
+ darray_for_each(io_opts->d, i)
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+ return &i->io_opts;
+ }
+
+ return &io_opts->fs_io_opts;
+}
+
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct bkey_s_c extent_k)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ /* reflink btree? */
+ if (!extent_k.k->p.inode) {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ return 0;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+ BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ if (!ret && bkey_is_inode(k.k)) {
+ struct bch_inode_unpacked inode;
+ bch2_inode_unpack(k, &inode);
+ bch2_inode_opts_get(io_opts, trans->c, &inode);
+ } else {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+int bch2_move_ratelimit(struct moving_context *ctxt)
+{
+ struct bch_fs *c = ctxt->trans->c;
+ bool is_kthread = current->flags & PF_KTHREAD;
+ u64 delay;
+
+ if (ctxt->wait_on_copygc && c->copygc_running) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ wait_event_killable(c->copygc_running_wq,
+ !c->copygc_running ||
+ (is_kthread && kthread_should_stop()));
+ }
+
+ do {
+ delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
+ if (is_kthread && kthread_should_stop())
+ return 1;
+
+ if (delay)
+ move_ctxt_wait_event_timeout(ctxt,
+ freezing(current) ||
+ (is_kthread && kthread_should_stop()),
+ delay);
+
+ if (unlikely(freezing(current))) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ try_to_freeze();
+ }
+ } while (delay);
+
+ /*
+ * XXX: these limits really ought to be per device, SSDs and hard drives
+ * will want different limits
+ */
+ move_ctxt_wait_event(ctxt,
+ atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+ atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+ atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
+ atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
+
+ return 0;
+}
+
+static int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct per_snapshot_io_opts snapshot_io_opts;
+ struct bch_io_opts *io_opts;
+ struct bkey_buf sk;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct data_update_opts data_opts;
+ int ret = 0, ret2;
+
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
+ bch2_bkey_buf_init(&sk);
+
+ if (ctxt->stats) {
+ ctxt->stats->data_type = BCH_DATA_user;
+ ctxt->stats->pos = BBPOS(btree_id, start);
+ }
+
+ bch2_trans_iter_init(trans, &iter, btree_id, start,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ if (ctxt->rate)
+ bch2_ratelimit_reset(ctxt->rate);
+
+ while (!bch2_move_ratelimit(ctxt)) {
+ bch2_trans_begin(trans);
+
+ k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ if (bkey_ge(bkey_start_pos(k.k), end))
+ break;
+
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ if (!bkey_extent_is_direct_data(k.k))
+ goto next_nondata;
+
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+ ret = PTR_ERR_OR_ZERO(io_opts);
+ if (ret)
+ continue;
+
+ memset(&data_opts, 0, sizeof(data_opts));
+ if (!pred(c, arg, k, io_opts, &data_opts))
+ goto next;
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
+ if (ret2) {
+ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
+ continue;
+
+ if (ret2 == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ continue;
+ }
+
+ /* XXX signal failure */
+ goto next;
+ }
+next:
+ if (ctxt->stats)
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+next_nondata:
+ bch2_btree_iter_advance(&iter);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_bkey_buf_exit(&sk, c);
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
+
+ return ret;
+}
+
+int __bch2_move_data(struct moving_context *ctxt,
+ struct bbpos start,
+ struct bbpos end,
+ move_pred_fn pred, void *arg)
+{
+ struct bch_fs *c = ctxt->trans->c;
+ enum btree_id id;
+ int ret = 0;
+
+ for (id = start.btree;
+ id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+ id++) {
+ ctxt->stats->pos = BBPOS(id, POS_MIN);
+
+ if (!btree_type_has_ptrs(id) ||
+ !bch2_btree_id_root(c, id)->b)
+ continue;
+
+ ret = bch2_move_data_btree(ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
+ pred, arg, id);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+	struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_move_data(&ctxt, start, end, pred, arg);
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return ret;
+}
+
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct bpos bucket, int gen,
+ struct data_update_opts _data_opts)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ bool is_kthread = current->flags & PF_KTHREAD;
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bch_backpointer bp;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct bkey_s_c k;
+ struct data_update_opts data_opts;
+ unsigned dirty_sectors, bucket_size;
+ u64 fragmentation;
+ struct bpos bp_pos = POS_MIN;
+ int ret = 0;
+
+ trace_bucket_evacuate(c, &bucket);
+
+ bch2_bkey_buf_init(&sk);
+
+ /*
+ * We're not run in a context that handles transaction restarts:
+ */
+ bch2_trans_begin(trans);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED);
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret) {
+ bch_err_msg(c, ret, "looking up alloc key");
+ goto err;
+ }
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+ dirty_sectors = a->dirty_sectors;
+ bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+ fragmentation = a->fragmentation_lru;
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (ret) {
+ bch_err_msg(c, ret, "flushing btree write buffer");
+ goto err;
+ }
+
+ while (!(ret = bch2_move_ratelimit(ctxt))) {
+ if (is_kthread && kthread_should_stop())
+ break;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_get_next_backpointer(trans, bucket, gen,
+ &bp_pos, &bp,
+ BTREE_ITER_CACHED);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (bkey_eq(bp_pos, POS_MAX))
+ break;
+
+ if (!bp.level) {
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!k.k)
+ goto next;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, &iter);
+ continue;
+ }
+
+ data_opts = _data_opts;
+ data_opts.target = io_opts.background_target;
+ data_opts.rewrite_ptrs = 0;
+
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (ptr->dev == bucket.inode) {
+ data_opts.rewrite_ptrs |= 1U << i;
+ if (ptr->cached) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto next;
+ }
+ }
+ i++;
+ }
+
+ ret = bch2_move_extent(ctxt, bucket_in_flight,
+ &iter, k, io_opts, data_opts);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ continue;
+ }
+ if (ret)
+ goto err;
+
+ if (ctxt->stats)
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+ } else {
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!b)
+ goto next;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate,
+ c->opts.btree_node_size >> 9);
+ if (ctxt->stats) {
+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ }
+ }
+next:
+ bp_pos = bpos_nosnap_successor(bp_pos);
+ }
+
+ trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
+err:
+ bch2_bkey_buf_exit(&sk, c);
+ return ret;
+}
+
+int bch2_evacuate_bucket(struct bch_fs *c,
+ struct bpos bucket, int gen,
+ struct data_update_opts data_opts,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
+{
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return ret;
+}
+
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+ struct btree *, struct bch_io_opts *,
+ struct data_update_opts *);
+
+static int bch2_move_btree(struct bch_fs *c,
+ enum btree_id start_btree_id, struct bpos start_pos,
+ enum btree_id end_btree_id, struct bpos end_pos,
+ move_btree_pred pred, void *arg,
+ struct bch_move_stats *stats)
+{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct moving_context ctxt;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct btree *b;
+ enum btree_id id;
+ struct data_update_opts data_opts;
+ int ret = 0;
+
+ bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+ writepoint_ptr(&c->btree_write_point),
+ true);
+ trans = ctxt.trans;
+
+ stats->data_type = BCH_DATA_btree;
+
+ for (id = start_btree_id;
+ id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+ id++) {
+ stats->pos = BBPOS(id, POS_MIN);
+
+ if (!bch2_btree_id_root(c, id)->b)
+ continue;
+
+ bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_PREFETCH);
+retry:
+ ret = 0;
+ while (bch2_trans_begin(trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
+ if (kthread && kthread_should_stop())
+ break;
+
+ if ((cmp_int(id, end_btree_id) ?:
+ bpos_cmp(b->key.k.p, end_pos)) > 0)
+ break;
+
+ stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ if (!pred(c, arg, b, &io_opts, &data_opts))
+ goto next;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+next:
+ bch2_btree_iter_next_node(&iter);
+ }
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (kthread && kthread_should_stop())
+ break;
+ }
+
+ bch_err_fn(c, ret);
+ bch2_moving_ctxt_exit(&ctxt);
+ bch2_btree_interior_updates_flush(c);
+
+ return ret;
+}
+
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned nr_good = bch2_bkey_durability(c, k);
+ unsigned replicas = bkey_is_btree_ptr(k.k)
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
+
+ if (!nr_good || nr_good >= replicas)
+ return false;
+
+ data_opts->target = 0;
+ data_opts->extra_replicas = replicas - nr_good;
+ data_opts->btree_insert_flags = 0;
+ return true;
+}
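+
+/*
+ * e.g. (illustrative): an extent with durability 1 when data_replicas is 3
+ * gets data_opts->extra_replicas = 2; extents already at or above the target
+ * durability are left alone.
+ */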
+
+static bool migrate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ struct bch_ioctl_data *op = arg;
+ unsigned i = 0;
+
+ data_opts->rewrite_ptrs = 0;
+ data_opts->target = 0;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == op->migrate.dev)
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+
+ return data_opts->rewrite_ptrs != 0;
+}
+
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static bool bformat_needs_redo(struct bkey_format *f)
+{
+ unsigned i;
+
+ for (i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f->bits_per_field[i] > unpacked_bits)
+ return true;
+
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ return true;
+
+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+ unpacked_mask) <
+ field_offset)
+ return true;
+ }
+
+ return false;
+}
+
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ if (b->version_ondisk != c->sb.version ||
+ btree_node_need_rewrite(b) ||
+ bformat_needs_redo(&b->format)) {
+ data_opts->target = 0;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = 0;
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ int ret;
+
+ ret = bch2_move_btree(c,
+ 0, POS_MIN,
+ BTREE_ID_NR, SPOS_MAX,
+ rewrite_old_nodes_pred, c, stats);
+ if (!ret) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+ c->disk_sb.sb->version_min = c->disk_sb.sb->version;
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_data_job(struct bch_fs *c,
+ struct bch_move_stats *stats,
+ struct bch_ioctl_data op)
+{
+ int ret = 0;
+
+ switch (op.op) {
+ case BCH_DATA_OP_REREPLICATE:
+ bch2_move_stats_init(stats, "rereplicate");
+ stats->data_type = BCH_DATA_journal;
+ ret = bch2_journal_flush_device_pins(&c->journal, -1);
+
+ ret = bch2_move_btree(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ rereplicate_btree_pred, c, stats) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+
+ ret = bch2_move_data(c,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ rereplicate_pred, c) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
+ break;
+ case BCH_DATA_OP_MIGRATE:
+ if (op.migrate.dev >= c->sb.nr_devices)
+ return -EINVAL;
+
+ bch2_move_stats_init(stats, "migrate");
+ stats->data_type = BCH_DATA_journal;
+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
+
+ ret = bch2_move_btree(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ migrate_btree_pred, &op, stats) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+
+ ret = bch2_move_data(c,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ migrate_pred, &op) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
+ break;
+ case BCH_DATA_OP_REWRITE_OLD_NODES:
+ bch2_move_stats_init(stats, "rewrite_old_nodes");
+ ret = bch2_scan_old_btree_nodes(c, stats);
+ bch2_move_stats_exit(stats, c);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
+{
+ prt_printf(out, "%s: data type=%s pos=",
+ stats->name,
+ bch2_data_types[stats->data_type]);
+ bch2_bbpos_to_text(out, stats->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "keys moved: ");
+ prt_u64(out, atomic64_read(&stats->keys_moved));
+ prt_newline(out);
+
+ prt_str(out, "keys raced: ");
+ prt_u64(out, atomic64_read(&stats->keys_raced));
+ prt_newline(out);
+
+ prt_str(out, "bytes seen: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+ prt_newline(out);
+
+ prt_str(out, "bytes moved: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
+ prt_newline(out);
+
+ prt_str(out, "bytes raced: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ bch2_move_stats_to_text(out, ctxt->stats);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "reads: ios %u/%u sectors %u/%u",
+ atomic_read(&ctxt->read_ios),
+ c->opts.move_ios_in_flight,
+ atomic_read(&ctxt->read_sectors),
+ c->opts.move_bytes_in_flight >> 9);
+ prt_newline(out);
+
+ prt_printf(out, "writes: ios %u/%u sectors %u/%u",
+ atomic_read(&ctxt->write_ios),
+ c->opts.move_ios_in_flight,
+ atomic_read(&ctxt->write_sectors),
+ c->opts.move_bytes_in_flight >> 9);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ mutex_lock(&ctxt->lock);
+ list_for_each_entry(io, &ctxt->ios, io_list)
+ bch2_write_op_to_text(out, &io->write.op);
+ mutex_unlock(&ctxt->lock);
+
+ printbuf_indent_sub(out, 4);
+}
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct moving_context *ctxt;
+
+ mutex_lock(&c->moving_context_lock);
+ list_for_each_entry(ctxt, &c->moving_context_list, list)
+ bch2_moving_ctxt_to_text(out, c, ctxt);
+ mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_fs_move_init(struct bch_fs *c)
+{
+ INIT_LIST_HEAD(&c->moving_context_list);
+ mutex_init(&c->moving_context_lock);
+}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
new file mode 100644
index 000000000000..0906aa2d1de2
--- /dev/null
+++ b/fs/bcachefs/move.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_H
+#define _BCACHEFS_MOVE_H
+
+#include "bbpos.h"
+#include "bcachefs_ioctl.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "move_types.h"
+
+struct bch_read_bio;
+
+struct moving_context {
+ struct btree_trans *trans;
+ struct list_head list;
+ void *fn;
+
+ struct bch_ratelimit *rate;
+ struct bch_move_stats *stats;
+ struct write_point_specifier wp;
+ bool wait_on_copygc;
+ bool write_error;
+
+ /* For waiting on outstanding reads and writes: */
+ struct closure cl;
+
+ struct mutex lock;
+ struct list_head reads;
+ struct list_head ios;
+
+ /* in flight sectors: */
+ atomic_t read_sectors;
+ atomic_t write_sectors;
+ atomic_t read_ios;
+ atomic_t write_ios;
+
+ wait_queue_head_t wait;
+};
+
+#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \
+({ \
+ int _ret = 0; \
+ while (true) { \
+ bool cond_finished = false; \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
+ \
+ if (_cond) \
+ break; \
+ bch2_trans_unlock_long((_ctxt)->trans); \
+ _ret = __wait_event_timeout((_ctxt)->wait, \
+ bch2_moving_ctxt_next_pending_write(_ctxt) || \
+ (cond_finished = (_cond)), _timeout); \
+		if (_ret || (cond_finished))					\
+ break; \
+ } \
+ _ret; \
+})
+
+#define move_ctxt_wait_event(_ctxt, _cond) \
+do { \
+ bool cond_finished = false; \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
+ \
+ if (_cond) \
+ break; \
+ bch2_trans_unlock_long((_ctxt)->trans); \
+ __wait_event((_ctxt)->wait, \
+ bch2_moving_ctxt_next_pending_write(_ctxt) || \
+ (cond_finished = (_cond))); \
+ if (cond_finished) \
+ break; \
+} while (1)
+
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+ struct bch_io_opts *, struct data_update_opts *);
+
+void bch2_moving_ctxt_exit(struct moving_context *);
+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
+ struct bch_ratelimit *, struct bch_move_stats *,
+ struct write_point_specifier, bool);
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+ u32 snapshot;
+ struct bch_io_opts io_opts;
+};
+
+struct per_snapshot_io_opts {
+ u64 cur_inum;
+ struct bch_io_opts fs_io_opts;
+ DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+ memset(io_opts, 0, sizeof(*io_opts));
+ io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+ darray_exit(&io_opts->d);
+}
+
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
+
+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+
+int bch2_move_extent(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct bch_io_opts,
+ struct data_update_opts);
+
+int __bch2_move_data(struct moving_context *,
+ struct bbpos,
+ struct bbpos,
+ move_pred_fn, void *);
+int bch2_move_data(struct bch_fs *,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *,
+ struct bch_move_stats *,
+ struct write_point_specifier,
+ bool,
+ move_pred_fn, void *);
+
+int __bch2_evacuate_bucket(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct bpos, int,
+ struct data_update_opts);
+int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
+ struct data_update_opts,
+ struct bch_ratelimit *,
+ struct bch_move_stats *,
+ struct write_point_specifier,
+ bool);
+int bch2_data_job(struct bch_fs *,
+ struct bch_move_stats *,
+ struct bch_ioctl_data);
+
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, char *);
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_move_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
new file mode 100644
index 000000000000..e22841ef31e4
--- /dev/null
+++ b/fs/bcachefs/move_types.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_TYPES_H
+#define _BCACHEFS_MOVE_TYPES_H
+
+#include "bbpos_types.h"
+
+struct bch_move_stats {
+ enum bch_data_type data_type;
+ struct bbpos pos;
+ char name[32];
+
+ atomic64_t keys_moved;
+ atomic64_t keys_raced;
+ atomic64_t sectors_seen;
+ atomic64_t sectors_moved;
+ atomic64_t sectors_raced;
+};
+
+struct move_bucket_key {
+ struct bpos bucket;
+ u8 gen;
+};
+
+struct move_bucket {
+ struct move_bucket_key k;
+ unsigned sectors;
+};
+
+struct move_bucket_in_flight {
+ struct move_bucket_in_flight *next;
+ struct rhash_head hash;
+ struct move_bucket bucket;
+ atomic_t count;
+};
+
+#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
new file mode 100644
index 000000000000..a84e79f79e5e
--- /dev/null
+++ b/fs/bcachefs/movinggc.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "clock.h"
+#include "errcode.h"
+#include "error.h"
+#include "lru.h"
+#include "move.h"
+#include "movinggc.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/sched/task.h>
+#include <linux/wait.h>
+
+struct buckets_in_flight {
+ struct rhashtable table;
+ struct move_bucket_in_flight *first;
+ struct move_bucket_in_flight *last;
+ size_t nr;
+ size_t sectors;
+};
+
+static const struct rhashtable_params bch_move_bucket_params = {
+ .head_offset = offsetof(struct move_bucket_in_flight, hash),
+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .key_len = sizeof(struct move_bucket_key),
+};
+
+static struct move_bucket_in_flight *
+move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
+{
+ struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
+ int ret;
+
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ new->bucket = b;
+
+ ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
+ bch_move_bucket_params);
+ if (ret) {
+ kfree(new);
+ return ERR_PTR(ret);
+ }
+
+ if (!list->first)
+ list->first = new;
+ else
+ list->last->next = new;
+
+ list->last = new;
+ list->nr++;
+ list->sectors += b.sectors;
+ return new;
+}
+
+static int bch2_bucket_is_movable(struct btree_trans *trans,
+ struct move_bucket *b, u64 time)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a;
+ int ret;
+
+ if (bch2_bucket_is_open(trans->c,
+ b->k.bucket.inode,
+ b->k.bucket.offset))
+ return 0;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ b->k.bucket, BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ a = bch2_alloc_to_v4(k, &_a);
+ b->k.gen = a->gen;
+ b->sectors = a->dirty_sectors;
+
+ ret = data_type_movable(a->data_type) &&
+ a->fragmentation_lru &&
+ a->fragmentation_lru <= time;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void move_buckets_wait(struct moving_context *ctxt,
+ struct buckets_in_flight *list,
+ bool flush)
+{
+ struct move_bucket_in_flight *i;
+ int ret;
+
+ while ((i = list->first)) {
+ if (flush)
+ move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
+
+ if (atomic_read(&i->count))
+ break;
+
+ list->first = i->next;
+ if (!list->first)
+ list->last = NULL;
+
+ list->nr--;
+ list->sectors -= i->bucket.sectors;
+
+ ret = rhashtable_remove_fast(&list->table, &i->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret);
+ kfree(i);
+ }
+
+ bch2_trans_unlock_long(ctxt->trans);
+}
+
+static bool bucket_in_flight(struct buckets_in_flight *list,
+ struct move_bucket_key k)
+{
+ return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
+}
+
+typedef DARRAY(struct move_bucket) move_buckets;
+
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ move_buckets *buckets)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
+ size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
+ int ret;
+
+ move_buckets_wait(ctxt, buckets_in_flight, false);
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+ __func__, bch2_err_str(ret)))
+ return ret;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
+ lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+ 0, k, ({
+ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
+ int ret2 = 0;
+
+ saw++;
+
+ if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)))
+ not_movable++;
+ else if (bucket_in_flight(buckets_in_flight, b.k))
+ in_flight++;
+ else {
+ ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
+ if (ret2 >= 0)
+ sectors += b.sectors;
+ }
+ ret2;
+ }));
+
+ pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
+ buckets_in_flight->nr, buckets_in_flight->sectors,
+ saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
+
+ return ret < 0 ? ret : 0;
+}
+
+noinline
+static int bch2_copygc(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ bool *did_work)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct data_update_opts data_opts = {
+ .btree_insert_flags = BCH_WATERMARK_copygc,
+ };
+ move_buckets buckets = { 0 };
+ struct move_bucket_in_flight *f;
+ struct move_bucket *i;
+ u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
+ int ret = 0;
+
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
+ if (ret)
+ goto err;
+
+ darray_for_each(buckets, i) {
+ if (kthread_should_stop() || freezing(current))
+ break;
+
+ f = move_bucket_in_flight_add(buckets_in_flight, *i);
+ ret = PTR_ERR_OR_ZERO(f);
+ if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
+ ret = 0;
+ continue;
+ }
+ if (ret == -ENOMEM) { /* flush IO, continue later */
+ ret = 0;
+ break;
+ }
+
+ ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
+ f->bucket.k.gen, data_opts);
+ if (ret)
+ goto err;
+
+ *did_work = true;
+ }
+err:
+ darray_exit(&buckets);
+
+ /* no entries in LRU btree found, or got to end: */
+ if (bch2_err_matches(ret, ENOENT))
+ ret = 0;
+
+ if (ret < 0 && !bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "from bch2_move_data()");
+
+ moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
+ trace_and_count(c, copygc, c, moved, 0, 0, 0);
+ return ret;
+}
+
+/*
+ * Copygc runs when the amount of fragmented data is above some arbitrary
+ * threshold:
+ *
+ * The threshold at the limit - when the device is full - is the amount of space
+ * we reserved in bch2_recalc_capacity: if more space than that were stranded
+ * due to fragmentation, we could no longer store everything we have promised
+ * to store.
+ *
+ * But we don't want to be running copygc unnecessarily when the device still
+ * has plenty of free space - rather, we want copygc to smoothly run every so
+ * often and continually reduce the amount of fragmented space as the device
+ * fills up. So, we increase the threshold by half the current free space.
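+ *
+ * Illustrative example (hypothetical numbers): on a device with 1024
+ * available buckets of 2048 sectors, fragmented_allowed is
+ * (1024 * 2048) >> 1 = 1048576 sectors (512 MiB). If 204800 sectors (100 MiB)
+ * of movable data are currently fragmented, copygc waits until roughly
+ * 1048576 - 204800 = 843776 more sectors (~412 MiB) have been written before
+ * running again.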
+ */
+unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned dev_idx;
+ s64 wait = S64_MAX, fragmented_allowed, fragmented;
+ unsigned i;
+
+ for_each_rw_member(ca, c, dev_idx) {
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+
+ fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
+ ca->mi.bucket_size) >> 1);
+ fragmented = 0;
+
+ for (i = 0; i < BCH_DATA_NR; i++)
+ if (data_type_movable(i))
+ fragmented += usage.d[i].fragmented;
+
+ wait = min(wait, max(0LL, fragmented_allowed - fragmented));
+ }
+
+ return wait;
+}
+
+void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ prt_printf(out, "Currently waiting for: ");
+ prt_human_readable_u64(out, max(0LL, c->copygc_wait -
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Currently waiting since: ");
+ prt_human_readable_u64(out, max(0LL,
+ atomic64_read(&c->io_clock[WRITE].now) -
+ c->copygc_wait_at) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Currently calculated wait: ");
+ prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
+ prt_newline(out);
+}
+
+static int bch2_copygc_thread(void *arg)
+{
+ struct bch_fs *c = arg;
+ struct moving_context ctxt;
+ struct bch_move_stats move_stats;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ struct buckets_in_flight *buckets;
+ u64 last, wait;
+ int ret = 0;
+
+ buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
+ if (!buckets)
+ return -ENOMEM;
+ ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
+ if (ret) {
+ kfree(buckets);
+ bch_err_msg(c, ret, "allocating copygc buckets in flight");
+ return ret;
+ }
+
+ set_freezable();
+
+ bch2_move_stats_init(&move_stats, "copygc");
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
+ writepoint_ptr(&c->copygc_write_point),
+ false);
+
+ while (!ret && !kthread_should_stop()) {
+ bool did_work = false;
+
+ bch2_trans_unlock_long(ctxt.trans);
+ cond_resched();
+
+ if (!c->copy_gc_enabled) {
+ move_buckets_wait(&ctxt, buckets, true);
+ kthread_wait_freezable(c->copy_gc_enabled);
+ }
+
+ if (unlikely(freezing(current))) {
+ move_buckets_wait(&ctxt, buckets, true);
+ __refrigerator(false);
+ continue;
+ }
+
+ last = atomic64_read(&clock->now);
+ wait = bch2_copygc_wait_amount(c);
+
+ if (wait > clock->max_slop) {
+ c->copygc_wait_at = last;
+ c->copygc_wait = last + wait;
+ move_buckets_wait(&ctxt, buckets, true);
+ trace_and_count(c, copygc_wait, c, wait, last + wait);
+ bch2_kthread_io_clock_wait(clock, last + wait,
+ MAX_SCHEDULE_TIMEOUT);
+ continue;
+ }
+
+ c->copygc_wait = 0;
+
+ c->copygc_running = true;
+ ret = bch2_copygc(&ctxt, buckets, &did_work);
+ c->copygc_running = false;
+
+ wake_up(&c->copygc_running_wq);
+
+ if (!wait && !did_work) {
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ bch2_trans_unlock_long(ctxt.trans);
+ bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
+ MAX_SCHEDULE_TIMEOUT);
+ }
+ }
+
+ move_buckets_wait(&ctxt, buckets, true);
+
+ rhashtable_destroy(&buckets->table);
+ kfree(buckets);
+ bch2_moving_ctxt_exit(&ctxt);
+ bch2_move_stats_exit(&move_stats, c);
+
+ return 0;
+}
+
+void bch2_copygc_stop(struct bch_fs *c)
+{
+ if (c->copygc_thread) {
+ kthread_stop(c->copygc_thread);
+ put_task_struct(c->copygc_thread);
+ }
+ c->copygc_thread = NULL;
+}
+
+int bch2_copygc_start(struct bch_fs *c)
+{
+ struct task_struct *t;
+ int ret;
+
+ if (c->copygc_thread)
+ return 0;
+
+ if (c->opts.nochanges)
+ return 0;
+
+ if (bch2_fs_init_fault("copygc_start"))
+ return -ENOMEM;
+
+ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(t);
+ if (ret) {
+ bch_err_msg(c, ret, "creating copygc thread");
+ return ret;
+ }
+
+ get_task_struct(t);
+
+ c->copygc_thread = t;
+ wake_up_process(c->copygc_thread);
+
+ return 0;
+}
+
+void bch2_fs_copygc_init(struct bch_fs *c)
+{
+ init_waitqueue_head(&c->copygc_running_wq);
+ c->copygc_running = false;
+}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
new file mode 100644
index 000000000000..ea181fef5bc9
--- /dev/null
+++ b/fs/bcachefs/movinggc.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVINGGC_H
+#define _BCACHEFS_MOVINGGC_H
+
+unsigned long bch2_copygc_wait_amount(struct bch_fs *);
+void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_copygc_stop(struct bch_fs *);
+int bch2_copygc_start(struct bch_fs *);
+void bch2_fs_copygc_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
new file mode 100644
index 000000000000..3c21981a4a1c
--- /dev/null
+++ b/fs/bcachefs/nocow_locking.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "nocow_locking.h"
+#include "util.h"
+
+#include <linux/closure.h>
+
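+/*
+ * Each nocow_lock_bucket tracks up to four buckets; l->l[i] is a signed
+ * reference count for the bucket in l->b[i]: holders that pass
+ * BUCKET_NOCOW_LOCK_UPDATE take it with +1, all other holders with -1.
+ * Holders of the same sign share the lock; opposite signs exclude each
+ * other (see __bch2_bucket_nocow_trylock()).
+ */
+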
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
+ return true;
+ return false;
+}
+
+#define sign(v) ((v) < 0 ? -1 : (v) > 0 ? 1 : 0)
+
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+ int lock_val = flags ? 1 : -1;
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket) {
+ int v = atomic_sub_return(lock_val, &l->l[i]);
+
+ BUG_ON(v && sign(v) != lock_val);
+ if (!v)
+ closure_wake_up(&l->wait);
+ return;
+ }
+
+ BUG();
+}
+
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
+{
+ int v, lock_val = flags ? 1 : -1;
+ unsigned i;
+
+ spin_lock(&l->lock);
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket)
+ goto got_entry;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (!atomic_read(&l->l[i])) {
+ l->b[i] = dev_bucket;
+ goto take_lock;
+ }
+fail:
+ spin_unlock(&l->lock);
+ return false;
+got_entry:
+ v = atomic_read(&l->l[i]);
+ if (lock_val > 0 ? v < 0 : v > 0)
+ goto fail;
+take_lock:
+ v = atomic_read(&l->l[i]);
+ /* Overflow? */
+ if (v && sign(v + lock_val) != sign(v))
+ goto fail;
+
+ atomic_add(lock_val, &l->l[i]);
+ spin_unlock(&l->lock);
+ return true;
+}
+
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
+{
+ if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
+ struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
+ u64 start_time = local_clock();
+
+ __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+ bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ }
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
+{
+ unsigned i, nr_zero = 0;
+ struct nocow_lock_bucket *l;
+
+ for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
+ unsigned v = 0;
+
+ for (i = 0; i < ARRAY_SIZE(l->l); i++)
+ v |= atomic_read(&l->l[i]);
+
+ if (!v) {
+ nr_zero++;
+ continue;
+ }
+
+ if (nr_zero)
+ prt_printf(out, "(%u empty entries)\n", nr_zero);
+ nr_zero = 0;
+
+ for (i = 0; i < ARRAY_SIZE(l->l); i++) {
+ int v = atomic_read(&l->l[i]);
+ if (v) {
+ bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
+ prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
+ }
+ }
+ prt_newline(out);
+ }
+
+ if (nr_zero)
+ prt_printf(out, "(%u empty entries)\n", nr_zero);
+}
+
+void bch2_fs_nocow_locking_exit(struct bch_fs *c)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+ for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+ for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
+ BUG_ON(atomic_read(&l->l[j]));
+}
+
+int bch2_fs_nocow_locking_init(struct bch_fs *c)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+ for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+ spin_lock_init(&l->lock);
+
+ return 0;
+}
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
new file mode 100644
index 000000000000..f9d6a426a960
--- /dev/null
+++ b/fs/bcachefs/nocow_locking.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_H
+#define _BCACHEFS_NOCOW_LOCKING_H
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "nocow_locking_types.h"
+
+#include <linux/hash.h>
+
+static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ u64 dev_bucket)
+{
+ unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
+
+ return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
+}
+
+#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
+
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
+ struct nocow_lock_bucket *, u64, int);
+
+static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
+}
+
+static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
+ struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
+
+void bch2_fs_nocow_locking_exit(struct bch_fs *);
+int bch2_fs_nocow_locking_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_H */
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
new file mode 100644
index 000000000000..bd12bf677924
--- /dev/null
+++ b/fs/bcachefs/nocow_locking_types.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
+#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
+
+#define BUCKET_NOCOW_LOCKS_BITS 10
+#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
+
+struct nocow_lock_bucket {
+ struct closure_waitlist wait;
+ spinlock_t lock;
+ u64 b[4];
+ atomic_t l[4];
+} __aligned(SMP_CACHE_BYTES);
+
+struct bucket_nocow_lock_table {
+ struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS];
+};
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
+
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
new file mode 100644
index 000000000000..8dd4046cca41
--- /dev/null
+++ b/fs/bcachefs/opts.c
@@ -0,0 +1,602 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+
+#include "bcachefs.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "opts.h"
+#include "super-io.h"
+#include "util.h"
+
+#define x(t, n, ...) [n] = #t,
+
+const char * const bch2_error_actions[] = {
+ BCH_ERROR_ACTIONS()
+ NULL
+};
+
+const char * const bch2_fsck_fix_opts[] = {
+ BCH_FIX_ERRORS_OPTS()
+ NULL
+};
+
+const char * const bch2_version_upgrade_opts[] = {
+ BCH_VERSION_UPGRADE_OPTS()
+ NULL
+};
+
+const char * const bch2_sb_features[] = {
+ BCH_SB_FEATURES()
+ NULL
+};
+
+const char * const bch2_sb_compat[] = {
+ BCH_SB_COMPAT()
+ NULL
+};
+
+const char * const __bch2_btree_ids[] = {
+ BCH_BTREE_IDS()
+ NULL
+};
+
+const char * const bch2_csum_types[] = {
+ BCH_CSUM_TYPES()
+ NULL
+};
+
+const char * const bch2_csum_opts[] = {
+ BCH_CSUM_OPTS()
+ NULL
+};
+
+const char * const bch2_compression_types[] = {
+ BCH_COMPRESSION_TYPES()
+ NULL
+};
+
+const char * const bch2_compression_opts[] = {
+ BCH_COMPRESSION_OPTS()
+ NULL
+};
+
+const char * const bch2_str_hash_types[] = {
+ BCH_STR_HASH_TYPES()
+ NULL
+};
+
+const char * const bch2_str_hash_opts[] = {
+ BCH_STR_HASH_OPTS()
+ NULL
+};
+
+const char * const bch2_data_types[] = {
+ BCH_DATA_TYPES()
+ NULL
+};
+
+const char * const bch2_member_states[] = {
+ BCH_MEMBER_STATES()
+ NULL
+};
+
+const char * const bch2_jset_entry_types[] = {
+ BCH_JSET_ENTRY_TYPES()
+ NULL
+};
+
+const char * const bch2_fs_usage_types[] = {
+ BCH_FS_USAGE_TYPES()
+ NULL
+};
+
+#undef x
+
+static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
+ struct printbuf *err)
+{
+ if (!val) {
+ *res = FSCK_FIX_yes;
+ } else {
+ int ret = match_string(bch2_fsck_fix_opts, -1, val);
+
+ if (ret < 0 && err)
+ prt_str(err, "fix_errors: invalid selection");
+ if (ret < 0)
+ return ret;
+ *res = ret;
+ }
+
+ return 0;
+}
+
+static void bch2_opt_fix_errors_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ prt_str(out, bch2_fsck_fix_opts[v]);
+}
+
+#define bch2_opt_fix_errors (struct bch_opt_fn) { \
+ .parse = bch2_opt_fix_errors_parse, \
+ .to_text = bch2_opt_fix_errors_to_text, \
+}
+
+const char * const bch2_d_types[BCH_DT_MAX] = {
+ [DT_UNKNOWN] = "unknown",
+ [DT_FIFO] = "fifo",
+ [DT_CHR] = "chr",
+ [DT_DIR] = "dir",
+ [DT_BLK] = "blk",
+ [DT_REG] = "reg",
+ [DT_LNK] = "lnk",
+ [DT_SOCK] = "sock",
+ [DT_WHT] = "whiteout",
+ [DT_SUBVOL] = "subvol",
+};
+
+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
+{
+ BUG();
+}
+
+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
+{
+ BUG();
+}
+
+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
+{
+#define x(_name, ...) \
+ if (opt_defined(src, _name)) \
+ opt_set(*dst, _name, src._name);
+
+ BCH_OPTS()
+#undef x
+}
+
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Opt_##_name: \
+ return opt_defined(*opts, _name);
+ BCH_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Opt_##_name: \
+ return opts->_name;
+ BCH_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Opt_##_name: \
+ opt_set(*opts, _name, v); \
+ break;
+ BCH_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+const struct bch_option bch2_opt_table[] = {
+#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
+ .min = _min, .max = _max
+#define OPT_STR(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = ARRAY_SIZE(_choices), \
+ .choices = _choices
+#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
+
+#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
+ [Opt_##_name] = { \
+ .attr = { \
+ .name = #_name, \
+ .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
+ }, \
+ .flags = _flags, \
+ .hint = _hint, \
+ .help = _help, \
+ .get_sb = _sb_opt, \
+ .set_sb = SET_##_sb_opt, \
+ _type \
+ },
+
+ BCH_OPTS()
+#undef x
+};
+
+int bch2_opt_lookup(const char *name)
+{
+ const struct bch_option *i;
+
+ for (i = bch2_opt_table;
+ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
+ i++)
+ if (!strcmp(name, i->attr.name))
+ return i - bch2_opt_table;
+
+ return -1;
+}
+
+struct synonym {
+ const char *s1, *s2;
+};
+
+static const struct synonym bch_opt_synonyms[] = {
+ { "quota", "usrquota" },
+};
+
+static int bch2_mount_opt_lookup(const char *name)
+{
+ const struct synonym *i;
+
+ for (i = bch_opt_synonyms;
+ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
+ i++)
+ if (!strcmp(name, i->s1))
+ name = i->s2;
+
+ return bch2_opt_lookup(name);
+}
+
+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
+{
+ if (v < opt->min) {
+ if (err)
+ prt_printf(err, "%s: too small (min %llu)",
+ opt->attr.name, opt->min);
+ return -BCH_ERR_ERANGE_option_too_small;
+ }
+
+ if (opt->max && v >= opt->max) {
+ if (err)
+ prt_printf(err, "%s: too big (max %llu)",
+ opt->attr.name, opt->max);
+ return -BCH_ERR_ERANGE_option_too_big;
+ }
+
+ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
+ if (err)
+ prt_printf(err, "%s: not a multiple of 512",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
+ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
+ if (err)
+ prt_printf(err, "%s: must be a power of two",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
+ if (opt->fn.validate)
+ return opt->fn.validate(v, err);
+
+ return 0;
+}
+
+int bch2_opt_parse(struct bch_fs *c,
+ const struct bch_option *opt,
+ const char *val, u64 *res,
+ struct printbuf *err)
+{
+ ssize_t ret;
+
+ switch (opt->type) {
+ case BCH_OPT_BOOL:
+ if (val) {
+ ret = kstrtou64(val, 10, res);
+ } else {
+ ret = 0;
+ *res = 1;
+ }
+
+ if (ret < 0 || (*res != 0 && *res != 1)) {
+ if (err)
+ prt_printf(err, "%s: must be bool", opt->attr.name);
+ return ret < 0 ? ret : -EINVAL;
+ }
+ break;
+ case BCH_OPT_UINT:
+ if (!val) {
+ if (err)
+ prt_printf(err, "%s: required value",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
+ ret = opt->flags & OPT_HUMAN_READABLE
+ ? bch2_strtou64_h(val, res)
+ : kstrtou64(val, 10, res);
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: must be a number",
+ opt->attr.name);
+ return ret;
+ }
+ break;
+ case BCH_OPT_STR:
+ if (!val) {
+ if (err)
+ prt_printf(err, "%s: required value",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
+ ret = match_string(opt->choices, -1, val);
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: invalid selection",
+ opt->attr.name);
+ return ret;
+ }
+
+ *res = ret;
+ break;
+ case BCH_OPT_FN:
+ ret = opt->fn.parse(c, val, res, err);
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: parse error",
+ opt->attr.name);
+ return ret;
+ }
+ }
+
+ return bch2_opt_validate(opt, *res, err);
+}
+
+void bch2_opt_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bch_sb *sb,
+ const struct bch_option *opt, u64 v,
+ unsigned flags)
+{
+ if (flags & OPT_SHOW_MOUNT_STYLE) {
+ if (opt->type == BCH_OPT_BOOL) {
+ prt_printf(out, "%s%s",
+ v ? "" : "no",
+ opt->attr.name);
+ return;
+ }
+
+ prt_printf(out, "%s=", opt->attr.name);
+ }
+
+ switch (opt->type) {
+ case BCH_OPT_BOOL:
+ case BCH_OPT_UINT:
+ if (opt->flags & OPT_HUMAN_READABLE)
+ prt_human_readable_u64(out, v);
+ else
+ prt_printf(out, "%lli", v);
+ break;
+ case BCH_OPT_STR:
+ if (flags & OPT_SHOW_FULL_LIST)
+ prt_string_option(out, opt->choices, v);
+ else
+ prt_str(out, opt->choices[v]);
+ break;
+ case BCH_OPT_FN:
+ opt->fn.to_text(out, c, sb, v);
+ break;
+ default:
+ BUG();
+ }
+}
+
+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
+{
+ int ret = 0;
+
+ switch (id) {
+ case Opt_compression:
+ case Opt_background_compression:
+ ret = bch2_check_set_has_compressed_data(c, v);
+ break;
+ case Opt_erasure_code:
+ if (v)
+ bch2_check_set_feature(c, BCH_FEATURE_ec);
+ break;
+ }
+
+ return ret;
+}
+
+int bch2_opts_check_may_set(struct bch_fs *c)
+{
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ ret = bch2_opt_check_may_set(c, i,
+ bch2_opt_get_by_id(&c->opts, i));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
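+/*
+ * Example (illustrative, assuming POSIX ACL support is built in): the mount
+ * options string "degraded,noacl,metadata_replicas=2" sets degraded to 1,
+ * acl to 0 (a leading "no" negates a boolean option) and metadata_replicas
+ * to 2; unrecognized option names are silently ignored.
+ */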
+int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
+ char *options)
+{
+ char *copied_opts, *copied_opts_start;
+ char *opt, *name, *val;
+ int ret, id;
+ struct printbuf err = PRINTBUF;
+ u64 v;
+
+ if (!options)
+ return 0;
+
+ /*
+ * sys_fsconfig() is now occasionally providing us with option lists
+ * starting with a comma - weird.
+ */
+ if (*options == ',')
+ options++;
+
+ copied_opts = kstrdup(options, GFP_KERNEL);
+ if (!copied_opts)
+ return -1;
+ copied_opts_start = copied_opts;
+
+ while ((opt = strsep(&copied_opts, ",")) != NULL) {
+ name = strsep(&opt, "=");
+ val = opt;
+
+ id = bch2_mount_opt_lookup(name);
+
+ /* Check for the form "noopt", negation of a boolean opt: */
+ if (id < 0 &&
+ !val &&
+ !strncmp("no", name, 2)) {
+ id = bch2_mount_opt_lookup(name + 2);
+ val = "0";
+ }
+
+ /* Unknown options are ignored: */
+ if (id < 0)
+ continue;
+
+ if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+ goto bad_opt;
+
+ if (id == Opt_acl &&
+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
+ goto bad_opt;
+
+ if ((id == Opt_usrquota ||
+ id == Opt_grpquota) &&
+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
+ goto bad_opt;
+
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
+ if (ret < 0)
+ goto bad_val;
+
+ bch2_opt_set_by_id(opts, id, v);
+ }
+
+ ret = 0;
+ goto out;
+
+bad_opt:
+ pr_err("Bad mount option %s", name);
+ ret = -1;
+ goto out;
+bad_val:
+ pr_err("Invalid mount option %s", err.buf);
+ ret = -1;
+ goto out;
+out:
+ kfree(copied_opts_start);
+ printbuf_exit(&err);
+ return ret;
+}
+
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+{
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ v = opt->get_sb(sb);
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = 1ULL << v;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v <<= 9;
+
+ return v;
+}
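+
+/*
+ * Example (illustrative): encoded_extent_max is flagged
+ * OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2, so the superblock stores the ilog2
+ * of a sector count. With the default of 64k, __bch2_opt_set_sb() stores
+ * ilog2(65536 >> 9) = 7, and bch2_opt_from_sb() reads back
+ * (1 << 7) << 9 = 65536.
+ */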
+
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
+{
+ unsigned id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb == BCH2_NO_SB_OPT)
+ continue;
+
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
+ }
+
+ return 0;
+}
+
+void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
+{
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
+ return;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v >>= 9;
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = ilog2(v);
+
+ opt->set_sb(sb, v);
+}
+
+void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
+{
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
+ return;
+
+ mutex_lock(&c->sb_lock);
+ __bch2_opt_set_sb(c->disk_sb.sb, opt, v);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
+/* io opts: */
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+ return (struct bch_io_opts) {
+#define x(_name, _bits) ._name = src._name,
+ BCH_INODE_OPTS()
+#undef x
+ };
+}
+
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+ static const enum bch_opt_id inode_opt_list[] = {
+#define x(_name, _bits) Opt_##_name,
+ BCH_INODE_OPTS()
+#undef x
+ };
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
+ if (inode_opt_list[i] == id)
+ return true;
+
+ return false;
+}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
new file mode 100644
index 000000000000..8526f177450a
--- /dev/null
+++ b/fs/bcachefs/opts.h
@@ -0,0 +1,564 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_OPTS_H
+#define _BCACHEFS_OPTS_H
+
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include "bcachefs_format.h"
+
+struct bch_fs;
+
+extern const char * const bch2_error_actions[];
+extern const char * const bch2_fsck_fix_opts[];
+extern const char * const bch2_version_upgrade_opts[];
+extern const char * const bch2_sb_features[];
+extern const char * const bch2_sb_compat[];
+extern const char * const __bch2_btree_ids[];
+extern const char * const bch2_csum_types[];
+extern const char * const bch2_csum_opts[];
+extern const char * const bch2_compression_types[];
+extern const char * const bch2_compression_opts[];
+extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_str_hash_opts[];
+extern const char * const bch2_data_types[];
+extern const char * const bch2_member_states[];
+extern const char * const bch2_jset_entry_types[];
+extern const char * const bch2_fs_usage_types[];
+extern const char * const bch2_d_types[];
+
+static inline const char *bch2_d_type_str(unsigned d_type)
+{
+ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
+}
+
+/*
+ * Mount options; we also store defaults in the superblock.
+ *
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
+ * the superblock, changing it via sysfs (currently? might change this) also
+ * updates the superblock.
+ *
+ * We store options as signed integers, where -1 means undefined. This means we
+ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
+ * apply the options from that struct that are defined.
+ */
+
+/* dummy option, for options that aren't stored in the superblock */
+u64 BCH2_NO_SB_OPT(const struct bch_sb *);
+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
+
+/* When can be set: */
+enum opt_flags {
+ OPT_FS = (1 << 0), /* Filesystem option */
+ OPT_DEVICE = (1 << 1), /* Device option */
+ OPT_INODE = (1 << 2), /* Inode option */
+ OPT_FORMAT = (1 << 3), /* May be specified at format time */
+ OPT_MOUNT = (1 << 4), /* May be specified at mount time */
+ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */
+ OPT_HUMAN_READABLE = (1 << 6),
+ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
+ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
+ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
+};
+
+enum opt_type {
+ BCH_OPT_BOOL,
+ BCH_OPT_UINT,
+ BCH_OPT_STR,
+ BCH_OPT_FN,
+};
+
+struct bch_opt_fn {
+ int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+ int (*validate)(u64, struct printbuf *);
+};
+
+/**
+ * x(name, in-memory type, flags, option type, sb_opt, default, hint, help)
+ *
+ * @name - name of mount option, sysfs attribute, and struct bch_opts
+ * member
+ *
+ * @flags - when the option may be set (OPT_FS, OPT_FORMAT, OPT_MOUNT, ...)
+ *
+ * @sb_opt - name of corresponding superblock option, or BCH2_NO_SB_OPT if
+ * not stored in the superblock
+ *
+ * @option type - one of OPT_BOOL, OPT_UINT, OPT_STR, OPT_FN
+ */
+
+/*
+ * XXX: add fields for
+ * - default value
+ * - helptext
+ */
+
+#ifdef __KERNEL__
+#define RATELIMIT_ERRORS_DEFAULT true
+#else
+#define RATELIMIT_ERRORS_DEFAULT false
+#endif
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCACHEFS_VERBOSE_DEFAULT true
+#else
+#define BCACHEFS_VERBOSE_DEFAULT false
+#endif
+
+#define BCH_FIX_ERRORS_OPTS() \
+ x(exit, 0) \
+ x(yes, 1) \
+ x(no, 2) \
+ x(ask, 3)
+
+enum fsck_err_opts {
+#define x(t, n) FSCK_FIX_##t,
+ BCH_FIX_ERRORS_OPTS()
+#undef x
+};
+
+#define BCH_OPTS() \
+ x(block_size, u16, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(512, 1U << 16), \
+ BCH_SB_BLOCK_SIZE, 8, \
+ "size", NULL) \
+ x(btree_node_size, u32, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(512, 1U << 20), \
+ BCH_SB_BTREE_NODE_SIZE, 512, \
+ "size", "Btree node size, default 256k") \
+ x(errors, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_error_actions), \
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
+ NULL, "Action to take on filesystem error") \
+ x(metadata_replicas, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_META_REPLICAS_WANT, 1, \
+ "#", "Number of metadata replicas") \
+ x(data_replicas, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_DATA_REPLICAS_WANT, 1, \
+ "#", "Number of data replicas") \
+ x(metadata_replicas_required, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_META_REPLICAS_REQ, 1, \
+ "#", NULL) \
+ x(data_replicas_required, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_DATA_REPLICAS_REQ, 1, \
+ "#", NULL) \
+ x(encoded_extent_max, u32, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
+ OPT_UINT(4096, 2U << 20), \
+ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
+ "size", "Maximum size of checksummed/compressed extents")\
+ x(metadata_checksum, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_csum_opts), \
+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
+ NULL, NULL) \
+ x(data_checksum, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_csum_opts), \
+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
+ NULL, NULL) \
+ x(compression, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_compression), \
+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
+ NULL, NULL) \
+ x(background_compression, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_compression), \
+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
+ NULL, NULL) \
+ x(str_hash, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_str_hash_opts), \
+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
+ NULL, "Hash function for directory entries and xattrs")\
+ x(metadata_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_METADATA_TARGET, 0, \
+ "(target)", "Device or label for metadata writes") \
+ x(foreground_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_FOREGROUND_TARGET, 0, \
+ "(target)", "Device or label for foreground writes") \
+ x(background_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_BACKGROUND_TARGET, 0, \
+ "(target)", "Device or label to move data to in the background")\
+ x(promote_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_PROMOTE_TARGET, 0, \
+ "(target)", "Device or label to promote data to on read") \
+ x(erasure_code, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_ERASURE_CODE, false, \
+ NULL, "Enable erasure coding (DO NOT USE YET)") \
+ x(inodes_32bit, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_INODE_32BIT, true, \
+ NULL, "Constrain inode numbers to 32 bits") \
+ x(shard_inode_numbers, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_SHARD_INUMS, true, \
+ NULL, "Shard new inode numbers by CPU id") \
+ x(inodes_use_key_cache, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_INODES_USE_KEY_CACHE, true, \
+ NULL, "Use the btree key cache for the inodes btree") \
+ x(btree_node_mem_ptr_optimization, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Stash pointer to in memory btree node in btree ptr")\
+ x(btree_write_buffer_size, u32, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(16, (1U << 20) - 1), \
+ BCH2_NO_SB_OPT, 1U << 13, \
+ NULL, "Number of btree write buffer entries") \
+ x(gc_reserve_percent, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(5, 21), \
+ BCH_SB_GC_RESERVE, 8, \
+ "%", "Percentage of disk space to reserve for copygc")\
+ x(gc_reserve_bytes, u64, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \
+ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(0, U64_MAX), \
+ BCH_SB_GC_RESERVE_BYTES, 0, \
+ "%", "Amount of disk space to reserve for copygc\n" \
+ "Takes precedence over gc_reserve_percent if set")\
+ x(root_reserve_percent, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(0, 100), \
+ BCH_SB_ROOT_RESERVE, 0, \
+ "%", "Percentage of disk space to reserve for superuser")\
+ x(wide_macs, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_128_BIT_MACS, false, \
+ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\
+ x(inline_data, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable inline data extents") \
+ x(acl, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_POSIX_ACL, true, \
+ NULL, "Enable POSIX acls") \
+ x(usrquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_USRQUOTA, false, \
+ NULL, "Enable user quotas") \
+ x(grpquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_GRPQUOTA, false, \
+ NULL, "Enable group quotas") \
+ x(prjquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_PRJQUOTA, false, \
+ NULL, "Enable project quotas") \
+ x(degraded, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allow mounting in degraded mode") \
+ x(very_degraded, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allow mounting in when data will be missing") \
+ x(discard, u8, \
+ OPT_FS|OPT_MOUNT|OPT_DEVICE, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable discard/TRIM support") \
+ x(verbose, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \
+ NULL, "Extra debugging information during mount/recovery")\
+ x(journal_flush_delay, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, U32_MAX), \
+ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
+ NULL, "Delay in milliseconds before automatic journal commits")\
+ x(journal_flush_disabled, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \
+ NULL, "Disable journal flush on sync/fsync\n" \
+ "If enabled, writes can be lost, but only since the\n"\
+ "last journal write (default 1 second)") \
+ x(journal_reclaim_delay, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U32_MAX), \
+ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
+ NULL, "Delay in milliseconds before automatic journal reclaim")\
+ x(move_bytes_in_flight, u32, \
+ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1024, U32_MAX), \
+ BCH2_NO_SB_OPT, 1U << 20, \
+ NULL, "Maximum Amount of IO to keep in flight by the move path")\
+ x(move_ios_in_flight, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 1024), \
+ BCH2_NO_SB_OPT, 32, \
+ NULL, "Maximum number of IOs to keep in flight by the move path")\
+ x(fsck, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Run fsck on mount") \
+ x(fix_errors, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_FN(bch2_opt_fix_errors), \
+ BCH2_NO_SB_OPT, FSCK_FIX_exit, \
+ NULL, "Fix errors during fsck without asking") \
+ x(ratelimit_errors, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
+ NULL, "Ratelimit error messages during fsck") \
+ x(nochanges, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Super read only mode - no writes at all will be issued,\n"\
+ "even if we have to replay the journal") \
+ x(norecovery, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't replay the journal") \
+ x(keep_journal, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't free journal entries/keys after startup")\
+ x(read_entire_journal, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Read all journal entries, not just dirty ones")\
+ x(read_journal_only, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Only read the journal, skip the rest of recovery")\
+ x(journal_transaction_names, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
+ NULL, "Log transaction function names in journal") \
+ x(noexcl, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't open device in exclusive mode") \
+ x(direct_io, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Use O_DIRECT (userspace only)") \
+ x(sb, u64, \
+ OPT_MOUNT, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
+ "offset", "Sector offset of superblock") \
+ x(read_only, u8, \
+ OPT_FS, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, NULL) \
+ x(nostart, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don\'t start filesystem, only open devices") \
+ x(reconstruct_alloc, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Reconstruct alloc btree") \
+ x(version_upgrade, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR(bch2_version_upgrade_opts), \
+ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
+ NULL, "Set superblock to latest version,\n" \
+ "allowing any new features to be used") \
+ x(buckets_nouse, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allocate the buckets_nouse bitmap") \
+ x(project, u8, \
+ OPT_INODE, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, NULL) \
+ x(nocow, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_BOOL(), \
+ BCH_SB_NOCOW, false, \
+ NULL, "Nocow mode: Writes will be done in place when possible.\n"\
+ "Snapshots and reflink will still caused writes to be COW\n"\
+ "Implicitly disables data checksumming, compression and encryption")\
+ x(nocow_enabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable nocow mode: enables runtime locking in\n"\
+ "data move path needed if nocow will ever be in use\n")\
+ x(no_data_io, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Skip submit_bio() for data reads and writes, " \
+ "for performance testing purposes") \
+ x(fs_size, u64, \
+ OPT_DEVICE, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, 0, \
+ "size", "Size of filesystem on device") \
+ x(bucket, u32, \
+ OPT_DEVICE, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, 0, \
+ "size", "Size of filesystem on device") \
+ x(durability, u8, \
+ OPT_DEVICE, \
+ OPT_UINT(0, BCH_REPLICAS_MAX), \
+ BCH2_NO_SB_OPT, 1, \
+ "n", "Data written to this device will be considered\n"\
+ "to have already been replicated n times")
+
+struct bch_opts {
+#define x(_name, _bits, ...) unsigned _name##_defined:1;
+ BCH_OPTS()
+#undef x
+
+#define x(_name, _bits, ...) _bits _name;
+ BCH_OPTS()
+#undef x
+};
+
+static const __maybe_unused struct bch_opts bch2_opts_default = {
+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
+ ._name##_defined = true, \
+ ._name = _default, \
+
+ BCH_OPTS()
+#undef x
+};
+
+#define opt_defined(_opts, _name) ((_opts)._name##_defined)
+
+#define opt_get(_opts, _name) \
+ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
+
+#define opt_set(_opts, _name, _v) \
+do { \
+ (_opts)._name##_defined = true; \
+ (_opts)._name = _v; \
+} while (0)
+
+static inline struct bch_opts bch2_opts_empty(void)
+{
+ return (struct bch_opts) { 0 };
+}
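+
+/*
+ * Usage sketch for the helpers above (illustrative):
+ *
+ *	struct bch_opts opts = bch2_opts_empty();
+ *
+ *	opt_set(opts, metadata_replicas, 2);
+ *
+ * Afterwards opt_defined(opts, metadata_replicas) is true, while
+ * opt_defined(opts, data_replicas) is false and opt_get(opts, data_replicas)
+ * falls back to bch2_opts_default.
+ */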
+
+void bch2_opts_apply(struct bch_opts *, struct bch_opts);
+
+enum bch_opt_id {
+#define x(_name, ...) Opt_##_name,
+ BCH_OPTS()
+#undef x
+ bch2_opts_nr
+};
+
+struct bch_fs;
+struct printbuf;
+
+struct bch_option {
+ struct attribute attr;
+ u64 (*get_sb)(const struct bch_sb *);
+ void (*set_sb)(struct bch_sb *, u64);
+ enum opt_type type;
+ enum opt_flags flags;
+ u64 min, max;
+
+ const char * const *choices;
+
+ struct bch_opt_fn fn;
+
+ const char *hint;
+ const char *help;
+
+};
+
+extern const struct bch_option bch2_opt_table[];
+
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
+
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
+int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
+void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
+void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
+
+int bch2_opt_lookup(const char *);
+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
+ const char *, u64 *, struct printbuf *);
+
+#define OPT_SHOW_FULL_LIST (1 << 0)
+#define OPT_SHOW_MOUNT_STYLE (1 << 1)
+
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
+ const struct bch_option *, u64, unsigned);
+
+int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opts_check_may_set(struct bch_fs *);
+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
+
+/* inode opts: */
+
+struct bch_io_opts {
+#define x(_name, _bits) u##_bits _name;
+ BCH_INODE_OPTS()
+#undef x
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
+#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
new file mode 100644
index 000000000000..accf246c3233
--- /dev/null
+++ b/fs/bcachefs/printbuf.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/bitmap.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "printbuf.h"
+
+static inline unsigned printbuf_linelen(struct printbuf *buf)
+{
+ return buf->pos - buf->last_newline;
+}
+
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+{
+ unsigned new_size;
+ char *buf;
+
+ if (!out->heap_allocated)
+ return 0;
+
+ /* Reserve space for the terminating nul: */
+ extra += 1;
+
+ if (out->pos + extra < out->size)
+ return 0;
+
+ new_size = roundup_pow_of_two(out->size + extra);
+
+ /*
+ * Note: output buffer must be freeable with kfree(), it's not required
+ * that the user use printbuf_exit().
+ */
+ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+
+ if (!buf) {
+ out->allocation_failure = true;
+ return -ENOMEM;
+ }
+
+ out->buf = buf;
+ out->size = new_size;
+ return 0;
+}
+
+void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
+{
+ int len;
+
+ do {
+ va_list args2;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ } while (len + 1 >= printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len + 1));
+
+ len = min_t(size_t, len,
+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+ out->pos += len;
+}
+
+void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ do {
+ va_start(args, fmt);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+ va_end(args);
+ } while (len + 1 >= printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len + 1));
+
+ len = min_t(size_t, len,
+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+ out->pos += len;
+}
+
+/**
+ * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
+ * null terminated
+ * @buf: printbuf to terminate
+ * Returns: Printbuf contents, as a nul terminated C string
+ */
+const char *bch2_printbuf_str(const struct printbuf *buf)
+{
+ /*
+ * If we've written to a printbuf then it's guaranteed to be a null
+ * terminated string - but if we haven't, then we might not have
+ * allocated a buffer at all:
+ */
+ return buf->pos
+ ? buf->buf
+ : "";
+}
+
+/**
+ * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
+ * against accidental use.
+ * @buf: printbuf to exit
+ */
+void bch2_printbuf_exit(struct printbuf *buf)
+{
+ if (buf->heap_allocated) {
+ kfree(buf->buf);
+ buf->buf = ERR_PTR(-EINTR); /* poison value */
+ }
+}
+
+void bch2_printbuf_tabstops_reset(struct printbuf *buf)
+{
+ buf->nr_tabstops = 0;
+}
+
+void bch2_printbuf_tabstop_pop(struct printbuf *buf)
+{
+ if (buf->nr_tabstops)
+ --buf->nr_tabstops;
+}
+
+/*
+ * bch2_printbuf_tabstop_push() - add a tabstop, n spaces from the previous tabstop
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces from previous tabstop
+ *
+ * In the future this function may allocate memory if setting more than
+ * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
+ * of line.
+ */
+int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
+{
+ unsigned prev_tabstop = buf->nr_tabstops
+ ? buf->_tabstops[buf->nr_tabstops - 1]
+ : 0;
+
+ if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
+ return -EINVAL;
+
+ buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
+ buf->has_indent_or_tabstops = true;
+ return 0;
+}
+
+/**
+ * bch2_printbuf_indent_add() - add to the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to add to the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces more spaces.
+ */
+void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
+ spaces = 0;
+
+ buf->indent += spaces;
+ prt_chars(buf, ' ', spaces);
+
+ buf->has_indent_or_tabstops = true;
+}
+
+/**
+ * bch2_printbuf_indent_sub() - subtract from the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to subtract from the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces less spaces.
+ */
+void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(spaces > buf->indent))
+ spaces = buf->indent;
+
+ if (buf->last_newline + buf->indent == buf->pos) {
+ buf->pos -= spaces;
+ printbuf_nul_terminate(buf);
+ }
+ buf->indent -= spaces;
+
+ if (!buf->indent && !buf->nr_tabstops)
+ buf->has_indent_or_tabstops = false;
+}
+
+void bch2_prt_newline(struct printbuf *buf)
+{
+ unsigned i;
+
+ bch2_printbuf_make_room(buf, 1 + buf->indent);
+
+ __prt_char(buf, '\n');
+
+ buf->last_newline = buf->pos;
+
+ for (i = 0; i < buf->indent; i++)
+ __prt_char(buf, ' ');
+
+ printbuf_nul_terminate(buf);
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop = 0;
+}
+
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+ return buf->cur_tabstop < buf->nr_tabstops
+ ? buf->_tabstops[buf->cur_tabstop]
+ : 0;
+}
+
+static void __prt_tab(struct printbuf *out)
+{
+ int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
+
+ prt_chars(out, ' ', spaces);
+
+ out->last_field = out->pos;
+ out->cur_tabstop++;
+}
+
+/**
+ * bch2_prt_tab() - Advance printbuf to the next tabstop
+ * @out: printbuf to control
+ *
+ * Advance output to the next tabstop by printing spaces.
+ */
+void bch2_prt_tab(struct printbuf *out)
+{
+ if (WARN_ON(!cur_tabstop(out)))
+ return;
+
+ __prt_tab(out);
+}
+
+static void __prt_tab_rjust(struct printbuf *buf)
+{
+ unsigned move = buf->pos - buf->last_field;
+ int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
+
+ if (pad > 0) {
+ bch2_printbuf_make_room(buf, pad);
+
+ if (buf->last_field + pad < buf->size)
+ memmove(buf->buf + buf->last_field + pad,
+ buf->buf + buf->last_field,
+ min(move, buf->size - 1 - buf->last_field - pad));
+
+ if (buf->last_field < buf->size)
+ memset(buf->buf + buf->last_field, ' ',
+ min((unsigned) pad, buf->size - buf->last_field));
+
+ buf->pos += pad;
+ printbuf_nul_terminate(buf);
+ }
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop++;
+}
+
+/**
+ * bch2_prt_tab_rjust() - Advance printbuf to the next tabstop, right
+ * justifying previous output
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by inserting spaces immediately after the
+ * previous tabstop, right justifying previously output text.
+ */
+void bch2_prt_tab_rjust(struct printbuf *buf)
+{
+ if (WARN_ON(!cur_tabstop(buf)))
+ return;
+
+ __prt_tab_rjust(buf);
+}
+
+/**
+ * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
+ *
+ * @out: output printbuf
+ * @str: string to print
+ * @count: number of bytes to print
+ *
+ * The following control characters are handled as follows:
+ * \n: prt_newline newline that obeys current indent level
+ * \t: prt_tab advance to next tabstop
+ * \r: prt_tab_rjust advance to next tabstop, with right justification
+ */
+void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
+{
+ const char *unprinted_start = str;
+ const char *end = str + count;
+
+ if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
+ prt_bytes(out, str, count);
+ return;
+ }
+
+ while (str != end) {
+ switch (*str) {
+ case '\n':
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ bch2_prt_newline(out);
+ break;
+ case '\t':
+ if (likely(cur_tabstop(out))) {
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ __prt_tab(out);
+ }
+ break;
+ case '\r':
+ if (likely(cur_tabstop(out))) {
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ __prt_tab_rjust(out);
+ }
+ break;
+ }
+
+ str++;
+ }
+
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+}
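+
+/*
+ * Example (illustrative sketch): with a tabstop pushed, a whole row can be
+ * emitted from one string, letting the control characters do the layout:
+ *
+ *	bch2_printbuf_tabstop_push(&buf, 24);
+ *	prt_str_indented(&buf, "compression:\tlz4\n");
+ */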
+
+/**
+ * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
+ */
+void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
+{
+ bch2_printbuf_make_room(out, 10);
+ out->pos += string_get_size(v, 1, !out->si_units,
+ out->buf + out->pos,
+ printbuf_remaining_size(out));
+}
+
+/**
+ * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
+ */
+void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
+{
+ if (v < 0)
+ prt_char(out, '-');
+ bch2_prt_human_readable_u64(out, abs(v));
+}
+
+/**
+ * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void bch2_prt_units_u64(struct printbuf *out, u64 v)
+{
+ if (out->human_readable_units)
+ bch2_prt_human_readable_u64(out, v);
+ else
+ bch2_prt_printf(out, "%llu", v);
+}
+
+/**
+ * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void bch2_prt_units_s64(struct printbuf *out, s64 v)
+{
+ if (v < 0)
+ prt_char(out, '-');
+ bch2_prt_units_u64(out, abs(v));
+}
+
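+/*
+ * Example (illustrative): for list = { "none", "lz4", "gzip", NULL } and
+ * selected == 1, this emits "none [lz4] gzip " (with a trailing space).
+ */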
+void bch2_prt_string_option(struct printbuf *out,
+ const char * const list[],
+ size_t selected)
+{
+ size_t i;
+
+ for (i = 0; list[i]; i++)
+ bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
+}
+
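+/*
+ * Example (illustrative): for list = { "a", "b", "c", NULL } and flags == 0x5,
+ * this emits "a,c".
+ */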
+void bch2_prt_bitflags(struct printbuf *out,
+ const char * const list[], u64 flags)
+{
+ unsigned bit, nr = 0;
+ bool first = true;
+
+ while (list[nr])
+ nr++;
+
+ while (flags && (bit = __ffs64(flags)) < nr) {
+ if (!first)
+ bch2_prt_printf(out, ",");
+ first = false;
+ bch2_prt_printf(out, "%s", list[bit]);
+ flags ^= BIT_ULL(bit);
+ }
+}
+
+void bch2_prt_bitflags_vector(struct printbuf *out,
+ const char * const list[],
+ unsigned long *v, unsigned nr)
+{
+ bool first = true;
+ unsigned i;
+
+ for (i = 0; i < nr; i++)
+ if (!list[i]) {
+ nr = i;
+ break;
+ }
+
+ for_each_set_bit(i, v, nr) {
+ if (!first)
+ bch2_prt_printf(out, ",");
+ first = false;
+ bch2_prt_printf(out, "%s", list[i]);
+ }
+}
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
new file mode 100644
index 000000000000..9a4a56c40937
--- /dev/null
+++ b/fs/bcachefs/printbuf.h
@@ -0,0 +1,286 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _BCACHEFS_PRINTBUF_H
+#define _BCACHEFS_PRINTBUF_H
+
+/*
+ * Printbufs: Simple strings for printing to, with optional heap allocation
+ *
+ * This code has provisions for use in userspace, to aid in making other code
+ * portable between kernelspace and userspace.
+ *
+ * Basic example:
+ * struct printbuf buf = PRINTBUF;
+ *
+ * prt_printf(&buf, "foo=");
+ * foo_to_text(&buf, foo);
+ * printk("%s", buf.buf);
+ * printbuf_exit(&buf);
+ *
+ * Or
+ * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
+ *
+ * We can now write pretty printers instead of writing code that dumps
+ * everything to the kernel log buffer, and then those pretty-printers can be
+ * used by other code that outputs to kernel log, sysfs, debugfs, etc.
+ *
+ * Memory allocation: Outputting to a printbuf may allocate memory. This
+ * allocation is done with GFP_KERNEL, by default: use the newer
+ * memalloc_*_(save|restore) functions as needed.
+ *
+ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
+ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
+ *
+ * It's allowed to grab the output buffer and free it later with kfree() instead
+ * of using printbuf_exit(), if the user just needs a heap allocated string at
+ * the end.
+ *
+ * Memory allocation failures: We don't return errors directly, because on
+ * memory allocation failure we usually don't want to bail out and unwind - we
+ * want to print what we've got, on a best-effort basis. But code that does want
+ * to return -ENOMEM may check printbuf.allocation_failure.
+ *
+ * Indenting, tabstops:
+ *
+ * To aid in writing multi-line pretty printers spread across multiple
+ * functions, printbufs track the current indent level.
+ *
+ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the
+ * current indent level, respectively.
+ *
+ * To use tabstops, add them with printbuf_tabstop_push(); each is in units of
+ * spaces relative to the previous tabstop. Once set, prt_tab() will output
+ * spaces up to the next tabstop.
+ * prt_tab_rjust() will also advance the current line of text up to the next
+ * tabstop, but it does so by shifting text since the previous tabstop up to the
+ * next tabstop - right justifying it.
+ *
+ * Make sure you use prt_newline() instead of \n in the format string for indent
+ * level and tabstops to work correctly.
+ *
+ * Output units: printbuf->si_units and printbuf->human_readable_units tell
+ * pretty-printers how to output numbers: as a raw value (e.g. directly from a
+ * superblock field) or as human readable bytes. bch2_prt_units_u64() and
+ * bch2_prt_units_s64() obey them.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+enum printbuf_si {
+ PRINTBUF_UNITS_2, /* use binary powers of 2^10 */
+ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
+};
+
+#define PRINTBUF_INLINE_TABSTOPS 6
+
+struct printbuf {
+ char *buf;
+ unsigned size;
+ unsigned pos;
+ unsigned last_newline;
+ unsigned last_field;
+ unsigned indent;
+ /*
+ * If nonzero, allocations will be done with GFP_ATOMIC:
+ */
+ u8 atomic;
+ bool allocation_failure:1;
+ bool heap_allocated:1;
+ enum printbuf_si si_units:1;
+ bool human_readable_units:1;
+ bool has_indent_or_tabstops:1;
+ bool suppress_indent_tabstop_handling:1;
+ u8 nr_tabstops;
+
+ /*
+ * Do not modify directly: use printbuf_tabstop_add(),
+ * printbuf_tabstop_get()
+ */
+ u8 cur_tabstop;
+ u8 _tabstops[PRINTBUF_INLINE_TABSTOPS];
+};
+
+int bch2_printbuf_make_room(struct printbuf *, unsigned);
+__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
+__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
+const char *bch2_printbuf_str(const struct printbuf *);
+void bch2_printbuf_exit(struct printbuf *);
+
+void bch2_printbuf_tabstops_reset(struct printbuf *);
+void bch2_printbuf_tabstop_pop(struct printbuf *);
+int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
+
+void bch2_printbuf_indent_add(struct printbuf *, unsigned);
+void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
+
+void bch2_prt_newline(struct printbuf *);
+void bch2_prt_tab(struct printbuf *);
+void bch2_prt_tab_rjust(struct printbuf *);
+
+void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
+void bch2_prt_human_readable_u64(struct printbuf *, u64);
+void bch2_prt_human_readable_s64(struct printbuf *, s64);
+void bch2_prt_units_u64(struct printbuf *, u64);
+void bch2_prt_units_s64(struct printbuf *, s64);
+void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
+void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
+void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
+ unsigned long *, unsigned);
+
+/* Initializer for a heap allocated printbuf: */
+#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
+
+/* Initializer for a printbuf that points to an external buffer: */
+#define PRINTBUF_EXTERN(_buf, _size) \
+((struct printbuf) { \
+ .buf = _buf, \
+ .size = _size, \
+})
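+
+/*
+ * Example (illustrative sketch; nr_buckets is just a placeholder): printing
+ * into a fixed stack buffer - the buffer is not grown, and output that doesn't
+ * fit is truncated but still nul terminated:
+ *
+ *	char stack_buf[64];
+ *	struct printbuf buf = PRINTBUF_EXTERN(stack_buf, sizeof(stack_buf));
+ *
+ *	prt_printf(&buf, "%u buckets", nr_buckets);
+ */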
+
+/*
+ * Returns the number of bytes remaining in the output buffer, including space
+ * for the terminating nul:
+ */
+static inline unsigned printbuf_remaining_size(struct printbuf *out)
+{
+ return out->pos < out->size ? out->size - out->pos : 0;
+}
+
+/*
+ * Returns number of characters we can print to the output buffer - i.e.
+ * excluding the terminating nul:
+ */
+static inline unsigned printbuf_remaining(struct printbuf *out)
+{
+ return out->pos < out->size ? out->size - out->pos - 1 : 0;
+}
+
+static inline unsigned printbuf_written(struct printbuf *out)
+{
+ return out->size ? min(out->pos, out->size - 1) : 0;
+}
+
+/*
+ * Returns true if output was truncated:
+ */
+static inline bool printbuf_overflowed(struct printbuf *out)
+{
+ return out->pos >= out->size;
+}
+
+static inline void printbuf_nul_terminate(struct printbuf *out)
+{
+ bch2_printbuf_make_room(out, 1);
+
+ if (out->pos < out->size)
+ out->buf[out->pos] = 0;
+ else if (out->size)
+ out->buf[out->size - 1] = 0;
+}
+
+/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
+static inline void __prt_char_reserved(struct printbuf *out, char c)
+{
+ if (printbuf_remaining(out))
+ out->buf[out->pos] = c;
+ out->pos++;
+}
+
+/* Doesn't nul terminate: */
+static inline void __prt_char(struct printbuf *out, char c)
+{
+ bch2_printbuf_make_room(out, 1);
+ __prt_char_reserved(out, c);
+}
+
+static inline void prt_char(struct printbuf *out, char c)
+{
+ __prt_char(out, c);
+ printbuf_nul_terminate(out);
+}
+
+static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
+{
+ unsigned i, can_print = min(n, printbuf_remaining(out));
+
+ for (i = 0; i < can_print; i++)
+ out->buf[out->pos++] = c;
+ out->pos += n - can_print;
+}
+
+static inline void prt_chars(struct printbuf *out, char c, unsigned n)
+{
+ bch2_printbuf_make_room(out, n);
+ __prt_chars_reserved(out, c, n);
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
+{
+ unsigned i, can_print;
+
+ bch2_printbuf_make_room(out, n);
+
+ can_print = min(n, printbuf_remaining(out));
+
+ for (i = 0; i < can_print; i++)
+ out->buf[out->pos++] = ((char *) b)[i];
+ out->pos += n - can_print;
+
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_str(struct printbuf *out, const char *str)
+{
+ prt_bytes(out, str, strlen(str));
+}
+
+static inline void prt_str_indented(struct printbuf *out, const char *str)
+{
+ bch2_prt_bytes_indented(out, str, strlen(str));
+}
+
+static inline void prt_hex_byte(struct printbuf *out, u8 byte)
+{
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, hex_asc_hi(byte));
+ __prt_char_reserved(out, hex_asc_lo(byte));
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
+{
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, hex_asc_upper_hi(byte));
+ __prt_char_reserved(out, hex_asc_upper_lo(byte));
+ printbuf_nul_terminate(out);
+}
+
+/**
+ * printbuf_reset() - re-use a printbuf without freeing and re-initializing it
+ */
+static inline void printbuf_reset(struct printbuf *buf)
+{
+ buf->pos = 0;
+ buf->allocation_failure = 0;
+ buf->indent = 0;
+ buf->nr_tabstops = 0;
+ buf->cur_tabstop = 0;
+}
+
+/**
+ * printbuf_atomic_inc - mark as entering an atomic section
+ */
+static inline void printbuf_atomic_inc(struct printbuf *buf)
+{
+ buf->atomic++;
+}
+
+/**
+ * printbuf_atomic_dec - mark as leaving an atomic section
+ */
+static inline void printbuf_atomic_dec(struct printbuf *buf)
+{
+ buf->atomic--;
+}
+
+#endif /* _BCACHEFS_PRINTBUF_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
new file mode 100644
index 000000000000..a54647c36b85
--- /dev/null
+++ b/fs/bcachefs/quota.c
@@ -0,0 +1,979 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "quota.h"
+#include "snapshot.h"
+#include "super-io.h"
+
+static const char * const bch2_quota_types[] = {
+ "user",
+ "group",
+ "project",
+};
+
+static const char * const bch2_quota_counters[] = {
+ "space",
+ "inodes",
+};
+
+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+
+ if (vstruct_bytes(&q->field) < sizeof(*q)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&q->field), sizeof(*q));
+ return -BCH_ERR_invalid_sb_quota;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+ unsigned qtyp, counter;
+
+ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
+ prt_printf(out, "%s: flags %llx",
+ bch2_quota_types[qtyp],
+ le64_to_cpu(q->q[qtyp].flags));
+
+ for (counter = 0; counter < Q_COUNTERS; counter++)
+ prt_printf(out, " %s timelimit %u warnlimit %u",
+ bch2_quota_counters[counter],
+ le32_to_cpu(q->q[qtyp].c[counter].timelimit),
+ le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
+
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
+ .validate = bch2_sb_quota_validate,
+ .to_text = bch2_sb_quota_to_text,
+};
+
+int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err,
+ quota_type_invalid,
+ "invalid quota type (%llu >= %u)",
+ k.k->p.inode, QTYP_NR);
+fsck_err:
+ return ret;
+}
+
+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
+ unsigned i;
+
+ for (i = 0; i < Q_COUNTERS; i++)
+ prt_printf(out, "%s hardlimit %llu softlimit %llu",
+ bch2_quota_counters[i],
+ le64_to_cpu(dq.v->c[i].hardlimit),
+ le64_to_cpu(dq.v->c[i].softlimit));
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+
+static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_str(out, "i_fieldmask");
+ prt_tab(out);
+ prt_printf(out, "%x", i->i_fieldmask);
+ prt_newline(out);
+
+ prt_str(out, "i_flags");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_flags);
+ prt_newline(out);
+
+ prt_str(out, "i_spc_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_spc_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_ino_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_ino_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_rt_spc_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_rt_spc_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_spc_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_spc_warnlimit);
+ prt_newline(out);
+
+ prt_str(out, "i_ino_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_ino_warnlimit);
+ prt_newline(out);
+
+ prt_str(out, "i_rt_spc_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_rt_spc_warnlimit);
+ prt_newline(out);
+}
+
+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_str(out, "d_fieldmask");
+ prt_tab(out);
+ prt_printf(out, "%x", q->d_fieldmask);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_hardlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_hardlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_softlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_softlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_hardlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_hardlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_softlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_softlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_space");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_space);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_count");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_count);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_timer");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_timer);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_timer");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_timer);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_warns");
+ prt_tab(out);
+ prt_printf(out, "%i", q->d_ino_warns);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_warns");
+ prt_tab(out);
+ prt_printf(out, "%i", q->d_spc_warns);
+ prt_newline(out);
+}
+
+static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
+{
+ qtypes >>= i;
+ return qtypes ? i + __ffs(qtypes) : QTYP_NR;
+}
+
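+/*
+ * Iterate over the quota types whose bits are set in @_qtypes, with @_i set to
+ * the quota type index and @_q pointing at the corresponding in-memory quota
+ * table:
+ */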
+#define for_each_set_qtype(_c, _i, _q, _qtypes) \
+ for (_i = 0; \
+ (_i = __next_qtype(_i, _qtypes), \
+ _q = &(_c)->quotas[_i], \
+ _i < QTYP_NR); \
+ _i++)
+
+static bool ignore_hardlimit(struct bch_memquota_type *q)
+{
+ if (capable(CAP_SYS_RESOURCE))
+ return true;
+#if 0
+ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+
+ return capable(CAP_SYS_RESOURCE) &&
+ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+ !(info->dqi_flags & DQF_ROOT_SQUASH));
+#endif
+ return false;
+}
+
+enum quota_msg {
+ SOFTWARN, /* Softlimit reached */
+ SOFTLONGWARN, /* Grace time expired */
+ HARDWARN, /* Hardlimit reached */
+
+ HARDBELOW, /* Usage got below inode hardlimit */
+ SOFTBELOW, /* Usage got below inode softlimit */
+};
+
+static int quota_nl[][Q_COUNTERS] = {
+ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
+ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
+ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
+ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
+ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
+
+ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
+ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
+ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
+ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
+ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
+};
+
+struct quota_msgs {
+ u8 nr;
+ struct {
+ u8 qtype;
+ u8 msg;
+ } m[QTYP_NR * Q_COUNTERS];
+};
+
+static void prepare_msg(unsigned qtype,
+ enum quota_counters counter,
+ struct quota_msgs *msgs,
+ enum quota_msg msg_type)
+{
+ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
+
+ msgs->m[msgs->nr].qtype = qtype;
+ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
+ msgs->nr++;
+}
+
+static void prepare_warning(struct memquota_counter *qc,
+ unsigned qtype,
+ enum quota_counters counter,
+ struct quota_msgs *msgs,
+ enum quota_msg msg_type)
+{
+ if (qc->warning_issued & (1 << msg_type))
+ return;
+
+ prepare_msg(qtype, counter, msgs, msg_type);
+}
+
+static void flush_warnings(struct bch_qid qid,
+ struct super_block *sb,
+ struct quota_msgs *msgs)
+{
+ unsigned i;
+
+ for (i = 0; i < msgs->nr; i++)
+ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
+ sb->s_dev, msgs->m[i].msg);
+}
+
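+/*
+ * Check whether applying the delta @v to @counter would exceed this quota's
+ * limits: queues soft/hard limit warning messages as needed and returns
+ * -EDQUOT if the update must be rejected; decrements (v <= 0) never fail and
+ * may clear previously issued warnings.
+ */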
+static int bch2_quota_check_limit(struct bch_fs *c,
+ unsigned qtype,
+ struct bch_memquota *mq,
+ struct quota_msgs *msgs,
+ enum quota_counters counter,
+ s64 v,
+ enum quota_acct_mode mode)
+{
+ struct bch_memquota_type *q = &c->quotas[qtype];
+ struct memquota_counter *qc = &mq->c[counter];
+ u64 n = qc->v + v;
+
+ BUG_ON((s64) n < 0);
+
+ if (mode == KEY_TYPE_QUOTA_NOCHECK)
+ return 0;
+
+ if (v <= 0) {
+ if (n < qc->hardlimit &&
+ (qc->warning_issued & (1 << HARDWARN))) {
+ qc->warning_issued &= ~(1 << HARDWARN);
+ prepare_msg(qtype, counter, msgs, HARDBELOW);
+ }
+
+ if (n < qc->softlimit &&
+ (qc->warning_issued & (1 << SOFTWARN))) {
+ qc->warning_issued &= ~(1 << SOFTWARN);
+ prepare_msg(qtype, counter, msgs, SOFTBELOW);
+ }
+
+ qc->warning_issued = 0;
+ return 0;
+ }
+
+ if (qc->hardlimit &&
+ qc->hardlimit < n &&
+ !ignore_hardlimit(q)) {
+ prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+ return -EDQUOT;
+ }
+
+ if (qc->softlimit &&
+ qc->softlimit < n) {
+ if (qc->timer == 0) {
+ qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+ } else if (ktime_get_real_seconds() >= qc->timer &&
+ !ignore_hardlimit(q)) {
+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
+ return -EDQUOT;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+ enum quota_counters counter, s64 v,
+ enum quota_acct_mode mode)
+{
+ unsigned qtypes = enabled_qtypes(c);
+ struct bch_memquota_type *q;
+ struct bch_memquota *mq[QTYP_NR];
+ struct quota_msgs msgs;
+ unsigned i;
+ int ret = 0;
+
+ memset(&msgs, 0, sizeof(msgs));
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
+ if (!mq[i])
+ return -ENOMEM;
+ }
+
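+ /*
+ * Lock every affected quota type; the lockdep subclass is the qtype
+ * index, since the locks are always taken in increasing qtype order:
+ */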
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_lock_nested(&q->lock, i);
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
+ if (ret)
+ goto err;
+ }
+
+ for_each_set_qtype(c, i, q, qtypes)
+ mq[i]->c[counter].v += v;
+err:
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_unlock(&q->lock);
+
+ flush_warnings(qid, c->vfs_sb, &msgs);
+
+ return ret;
+}
+
+static void __bch2_quota_transfer(struct bch_memquota *src_q,
+ struct bch_memquota *dst_q,
+ enum quota_counters counter, s64 v)
+{
+ BUG_ON(v > src_q->c[counter].v);
+ BUG_ON(v + dst_q->c[counter].v < v);
+
+ src_q->c[counter].v -= v;
+ dst_q->c[counter].v += v;
+}
+
+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+ struct bch_qid dst,
+ struct bch_qid src, u64 space,
+ enum quota_acct_mode mode)
+{
+ struct bch_memquota_type *q;
+ struct bch_memquota *src_q[3], *dst_q[3];
+ struct quota_msgs msgs;
+ unsigned i;
+ int ret = 0;
+
+ qtypes &= enabled_qtypes(c);
+
+ memset(&msgs, 0, sizeof(msgs));
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
+ if (!src_q[i] || !dst_q[i])
+ return -ENOMEM;
+ }
+
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_lock_nested(&q->lock, i);
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
+ dst_q[i]->c[Q_SPC].v + space,
+ mode);
+ if (ret)
+ goto err;
+
+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
+ dst_q[i]->c[Q_INO].v + 1,
+ mode);
+ if (ret)
+ goto err;
+ }
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
+ }
+
+err:
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_unlock(&q->lock);
+
+ flush_warnings(dst, c->vfs_sb, &msgs);
+
+ return ret;
+}
+
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
+ struct qc_dqblk *qdq)
+{
+ struct bkey_s_c_quota dq;
+ struct bch_memquota_type *q;
+ struct bch_memquota *mq;
+ unsigned i;
+
+ BUG_ON(k.k->p.inode >= QTYP_NR);
+
+ if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
+ return 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_quota:
+ dq = bkey_s_c_to_quota(k);
+ q = &c->quotas[k.k->p.inode];
+
+ mutex_lock(&q->lock);
+ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
+ if (!mq) {
+ mutex_unlock(&q->lock);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < Q_COUNTERS; i++) {
+ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
+ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
+ }
+
+ if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
+ mq->c[Q_SPC].timer = qdq->d_spc_timer;
+ if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
+ mq->c[Q_SPC].warns = qdq->d_spc_warns;
+ if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
+ mq->c[Q_INO].timer = qdq->d_ino_timer;
+ if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
+ mq->c[Q_INO].warns = qdq->d_ino_warns;
+
+ mutex_unlock(&q->lock);
+ }
+
+ return 0;
+}
+
+void bch2_fs_quota_exit(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+ genradix_free(&c->quotas[i].table);
+}
+
+void bch2_fs_quota_init(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+ mutex_init(&c->quotas[i].lock);
+}
+
+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
+{
+ struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
+
+ if (sb_quota)
+ return sb_quota;
+
+ sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
+ if (sb_quota) {
+ unsigned qtype, qc;
+
+ for (qtype = 0; qtype < QTYP_NR; qtype++)
+ for (qc = 0; qc < Q_COUNTERS; qc++)
+ sb_quota->q[qtype].c[qc].timelimit =
+ cpu_to_le32(7 * 24 * 60 * 60);
+ }
+
+ return sb_quota;
+}
+
+static void bch2_sb_quota_read(struct bch_fs *c)
+{
+ struct bch_sb_field_quota *sb_quota;
+ unsigned i, j;
+
+ sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
+ if (!sb_quota)
+ return;
+
+ for (i = 0; i < QTYP_NR; i++) {
+ struct bch_memquota_type *q = &c->quotas[i];
+
+ for (j = 0; j < Q_COUNTERS; j++) {
+ q->limits[j].timelimit =
+ le32_to_cpu(sb_quota->q[i].c[j].timelimit);
+ q->limits[j].warnlimit =
+ le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
+ }
+ }
+}
+
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct bch_snapshot_tree s_t;
+ int ret;
+
+ ret = bch2_snapshot_tree_lookup(trans,
+ bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "%s: snapshot tree %u not found", __func__,
+ snapshot_t(c, k.k->p.snapshot)->tree);
+ if (ret)
+ return ret;
+
+ if (!s_t.master_subvol)
+ goto advance;
+
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
+ (subvol_inum) {
+ le32_to_cpu(s_t.master_subvol),
+ k.k->p.offset,
+ }, &u);
+ /*
+ * Inode might be deleted in this snapshot - the easiest way to handle
+ * that is to just skip it here:
+ */
+ if (bch2_err_matches(ret, ENOENT))
+ goto advance;
+
+ if (ret)
+ return ret;
+
+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+ KEY_TYPE_QUOTA_NOCHECK);
+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+ KEY_TYPE_QUOTA_NOCHECK);
+advance:
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ return 0;
+}
+
+int bch2_fs_quota_read(struct bch_fs *c)
+{
+ struct bch_sb_field_quota *sb_quota;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ mutex_unlock(&c->sb_lock);
+ return -BCH_ERR_ENOSPC_sb_quota;
+ }
+
+ bch2_sb_quota_read(c);
+ mutex_unlock(&c->sb_lock);
+
+ trans = bch2_trans_get(c);
+
+ ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ __bch2_quota_set(c, k, NULL)) ?:
+ for_each_btree_key2(trans, iter, BTREE_ID_inodes,
+ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ bch2_fs_quota_read_inode(trans, &iter, k));
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/* Enable/disable/delete quotas for an entire filesystem: */
+
+static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_sb_field_quota *sb_quota;
+ int ret = 0;
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ /* Accounting must be enabled at mount time: */
+ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
+ return -EINVAL;
+
+ /* Can't enable enforcement without accounting: */
+ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
+ return -EINVAL;
+
+ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
+ return -EINVAL;
+
+ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
+ return -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
+ }
+
+ if (uflags & FS_QUOTA_UDQ_ENFD)
+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
+
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
+
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
+
+ bch2_write_super(c);
+unlock:
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&c->sb_lock);
+ if (uflags & FS_QUOTA_UDQ_ENFD)
+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
+
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
+
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ if (uflags & FS_USER_QUOTA) {
+ if (c->opts.usrquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+ POS(QTYP_USR, 0),
+ POS(QTYP_USR, U64_MAX),
+ 0, NULL);
+ if (ret)
+ return ret;
+ }
+
+ if (uflags & FS_GROUP_QUOTA) {
+ if (c->opts.grpquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+ POS(QTYP_GRP, 0),
+ POS(QTYP_GRP, U64_MAX),
+ 0, NULL);
+ if (ret)
+ return ret;
+ }
+
+ if (uflags & FS_PROJ_QUOTA) {
+ if (c->opts.prjquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+ POS(QTYP_PRJ, 0),
+ POS(QTYP_PRJ, U64_MAX),
+ 0, NULL);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Return quota status information, such as enforcements, quota file inode
+ * numbers etc.
+ */
+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ unsigned qtypes = enabled_qtypes(c);
+ unsigned i;
+
+ memset(state, 0, sizeof(*state));
+
+ for (i = 0; i < QTYP_NR; i++) {
+ state->s_state[i].flags |= QCI_SYSFILE;
+
+ if (!(qtypes & (1 << i)))
+ continue;
+
+ state->s_state[i].flags |= QCI_ACCT_ENABLED;
+
+ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
+ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
+
+ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
+ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
+ }
+
+ return 0;
+}
+
+/*
+ * Adjust quota timers & warnings
+ */
+static int bch2_quota_set_info(struct super_block *sb, int type,
+ struct qc_info *info)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_sb_field_quota *sb_quota;
+ int ret = 0;
+
+ if (0) {
+ struct printbuf buf = PRINTBUF;
+
+ qc_info_to_text(&buf, info);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ if (type >= QTYP_NR)
+ return -EINVAL;
+
+ if (!((1 << type) & enabled_qtypes(c)))
+ return -ESRCH;
+
+ if (info->i_fieldmask &
+ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
+ return -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
+ }
+
+ if (info->i_fieldmask & QC_SPC_TIMER)
+ sb_quota->q[type].c[Q_SPC].timelimit =
+ cpu_to_le32(info->i_spc_timelimit);
+
+ if (info->i_fieldmask & QC_SPC_WARNS)
+ sb_quota->q[type].c[Q_SPC].warnlimit =
+ cpu_to_le32(info->i_spc_warnlimit);
+
+ if (info->i_fieldmask & QC_INO_TIMER)
+ sb_quota->q[type].c[Q_INO].timelimit =
+ cpu_to_le32(info->i_ino_timelimit);
+
+ if (info->i_fieldmask & QC_INO_WARNS)
+ sb_quota->q[type].c[Q_INO].warnlimit =
+ cpu_to_le32(info->i_ino_warnlimit);
+
+ bch2_sb_quota_read(c);
+
+ bch2_write_super(c);
+unlock:
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_err_class(ret);
+}
+
+/* Get/set individual quotas: */
+
+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
+{
+ dst->d_space = src->c[Q_SPC].v << 9;
+ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
+ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
+ dst->d_spc_timer = src->c[Q_SPC].timer;
+ dst->d_spc_warns = src->c[Q_SPC].warns;
+
+ dst->d_ino_count = src->c[Q_INO].v;
+ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
+ dst->d_ino_softlimit = src->c[Q_INO].softlimit;
+ dst->d_ino_timer = src->c[Q_INO].timer;
+ dst->d_ino_warns = src->c[Q_INO].warns;
+}
+
+static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_memquota_type *q = &c->quotas[kqid.type];
+ qid_t qid = from_kqid(&init_user_ns, kqid);
+ struct bch_memquota *mq;
+
+ memset(qdq, 0, sizeof(*qdq));
+
+ mutex_lock(&q->lock);
+ mq = genradix_ptr(&q->table, qid);
+ if (mq)
+ __bch2_quota_get(qdq, mq);
+ mutex_unlock(&q->lock);
+
+ return 0;
+}
+
+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_memquota_type *q = &c->quotas[kqid->type];
+ qid_t qid = from_kqid(&init_user_ns, *kqid);
+ struct genradix_iter iter;
+ struct bch_memquota *mq;
+ int ret = 0;
+
+ mutex_lock(&q->lock);
+
+ genradix_for_each_from(&q->table, iter, mq, qid)
+ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
+ __bch2_quota_get(qdq, mq);
+ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
+ goto found;
+ }
+
+ ret = -ENOENT;
+found:
+ mutex_unlock(&q->lock);
+ return bch2_err_class(ret);
+}
+
+static int bch2_set_quota_trans(struct btree_trans *trans,
+ struct bkey_i_quota *new_quota,
+ struct qc_dqblk *qdq)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (unlikely(ret))
+ return ret;
+
+ if (k.k->type == KEY_TYPE_quota)
+ new_quota->v = *bkey_s_c_to_quota(k).v;
+
+ if (qdq->d_fieldmask & QC_SPC_SOFT)
+ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
+ if (qdq->d_fieldmask & QC_SPC_HARD)
+ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
+
+ if (qdq->d_fieldmask & QC_INO_SOFT)
+ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+ if (qdq->d_fieldmask & QC_INO_HARD)
+ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+
+ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bkey_i_quota new_quota;
+ int ret;
+
+ if (0) {
+ struct printbuf buf = PRINTBUF;
+
+ qc_dqblk_to_text(&buf, qdq);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ bkey_quota_init(&new_quota.k_i);
+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
+
+ return bch2_err_class(ret);
+}
+
+const struct quotactl_ops bch2_quotactl_operations = {
+ .quota_enable = bch2_quota_enable,
+ .quota_disable = bch2_quota_disable,
+ .rm_xquota = bch2_quota_remove,
+
+ .get_state = bch2_quota_get_state,
+ .set_info = bch2_quota_set_info,
+
+ .get_dqblk = bch2_get_quota,
+ .get_nextdqblk = bch2_get_next_quota,
+ .set_dqblk = bch2_set_quota,
+};
+
+#endif /* CONFIG_BCACHEFS_QUOTA */
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
new file mode 100644
index 000000000000..884f601f41c4
--- /dev/null
+++ b/fs/bcachefs/quota.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_H
+#define _BCACHEFS_QUOTA_H
+
+#include "inode.h"
+#include "quota_types.h"
+
+enum bkey_invalid_flags;
+extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
+
+int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_quota ((struct bkey_ops) { \
+ .key_invalid = bch2_quota_invalid, \
+ .val_to_text = bch2_quota_to_text, \
+ .min_val_size = 32, \
+})
+
+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
+{
+ return (struct bch_qid) {
+ .q[QTYP_USR] = u->bi_uid,
+ .q[QTYP_GRP] = u->bi_gid,
+ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0,
+ };
+}
+
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+ return ((c->opts.usrquota << QTYP_USR)|
+ (c->opts.grpquota << QTYP_GRP)|
+ (c->opts.prjquota << QTYP_PRJ));
+}
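+
+/*
+ * Example (illustrative): with usrquota and prjquota enabled but not grpquota,
+ * enabled_qtypes() returns (1 << QTYP_USR)|(1 << QTYP_PRJ).
+ */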
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
+ s64, enum quota_acct_mode);
+
+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
+ struct bch_qid, u64, enum quota_acct_mode);
+
+void bch2_fs_quota_exit(struct bch_fs *);
+void bch2_fs_quota_init(struct bch_fs *);
+int bch2_fs_quota_read(struct bch_fs *);
+
+extern const struct quotactl_ops bch2_quotactl_operations;
+
+#else
+
+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+ enum quota_counters counter, s64 v,
+ enum quota_acct_mode mode)
+{
+ return 0;
+}
+
+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+ struct bch_qid dst,
+ struct bch_qid src, u64 space,
+ enum quota_acct_mode mode)
+{
+ return 0;
+}
+
+static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
+static inline void bch2_fs_quota_init(struct bch_fs *c) {}
+static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
+
+#endif
+
+#endif /* _BCACHEFS_QUOTA_H */
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
new file mode 100644
index 000000000000..6a136083d389
--- /dev/null
+++ b/fs/bcachefs/quota_types.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_TYPES_H
+#define _BCACHEFS_QUOTA_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+struct bch_qid {
+ u32 q[QTYP_NR];
+};
+
+enum quota_acct_mode {
+ KEY_TYPE_QUOTA_PREALLOC,
+ KEY_TYPE_QUOTA_WARN,
+ KEY_TYPE_QUOTA_NOCHECK,
+};
+
+struct memquota_counter {
+ u64 v;
+ u64 hardlimit;
+ u64 softlimit;
+ s64 timer;
+ int warns;
+ int warning_issued;
+};
+
+struct bch_memquota {
+ struct memquota_counter c[Q_COUNTERS];
+};
+
+typedef GENRADIX(struct bch_memquota) bch_memquota_table;
+
+struct quota_limit {
+ u32 timelimit;
+ u32 warnlimit;
+};
+
+struct bch_memquota_type {
+ struct quota_limit limits[Q_COUNTERS];
+ bch_memquota_table table;
+ struct mutex lock;
+};
+
+#endif /* _BCACHEFS_QUOTA_TYPES_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
new file mode 100644
index 000000000000..3319190b8d9c
--- /dev/null
+++ b/fs/bcachefs/rebalance.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "clock.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "move.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
+
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+ BCH_REBALANCE_STATES()
+ NULL
+#undef x
+};
+
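+/*
+ * Request a rebalance scan of inode @inum (or of the whole filesystem if @inum
+ * is 0) by bumping a cookie stored at a reserved offset in the rebalance_work
+ * btree; the cookie is deleted again once the scan it requested has completed,
+ * unless it was bumped again in the meantime.
+ */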
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie *cookie;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+ ret = PTR_ERR_OR_ZERO(cookie);
+ if (ret)
+ goto err;
+
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter.pos;
+ cookie->v.cookie = cpu_to_le64(v + 1);
+
+ ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ __bch2_set_rebalance_needs_scan(trans, inum));
+ rebalance_wakeup(c);
+ return ret;
+}
+
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+ return bch2_set_rebalance_needs_scan(c, 0);
+}
+
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ if (v == cookie)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+ struct btree_iter *work_iter)
+{
+ return !kthread_should_stop()
+ ? bch2_btree_iter_peek(work_iter)
+ : bkey_s_c_null;
+}
+
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ extent_entry_drop(bkey_i_to_s(n),
+ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+ return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter,
+ struct data_update_opts *data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+ work_pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek_slot(extent_iter);
+ if (bkey_err(k))
+ return k;
+
+ const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (!r) {
+ /* raced due to btree write buffer, nothing to do */
+ return bkey_s_c_null;
+ }
+
+ memset(data_opts, 0, sizeof(*data_opts));
+
+ data_opts->rewrite_ptrs =
+ bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+ data_opts->target = r->target;
+
+ if (!data_opts->rewrite_ptrs) {
+ /*
+ * device we would want to write to offline? devices in target
+ * changed?
+ *
+ * We'll now need a full scan before this extent is picked up
+ * again:
+ */
+ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
+ return bkey_s_c_null;
+ }
+
+ return k;
+}
+
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ struct data_update_opts data_opts;
+ struct bch_io_opts io_opts;
+ struct bkey_s_c k;
+ struct bkey_buf sk;
+ int ret;
+
+ ctxt->stats = &r->work_stats;
+ r->state = BCH_REBALANCE_working;
+
+ bch2_bkey_buf_init(&sk);
+
+ ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &data_opts));
+ if (ret || !k.k)
+ goto out;
+
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ if (ret)
+ goto out;
+
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+ if (ret) {
+ if (bch2_err_matches(ret, ENOMEM)) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto out;
+
+ /* skip it and continue, XXX signal failure */
+ ret = 0;
+ }
+out:
+ bch2_bkey_buf_exit(&sk, c);
+ return ret;
+}
+
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned target, compression;
+
+ if (k.k->p.inode) {
+ target = io_opts->background_target;
+ compression = io_opts->background_compression ?: io_opts->compression;
+ } else {
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ target = r ? r->target : io_opts->background_target;
+ compression = r ? r->compression :
+ (io_opts->background_compression ?: io_opts->compression);
+ }
+
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+ data_opts->target = target;
+ return data_opts->rewrite_ptrs != 0;
+}
+
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ int ret;
+
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+ ctxt->stats = &r->scan_stats;
+
+ if (!inum) {
+ r->scan_start = BBPOS_MIN;
+ r->scan_end = BBPOS_MAX;
+ } else {
+ r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
+ r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+ }
+
+ r->state = BCH_REBALANCE_scanning;
+
+ ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+ bch2_move_stats_exit(&r->scan_stats, trans->c);
+ return ret;
+}
+
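+/*
+ * No rebalance work: wait on the write io clock until roughly 1/64th of the
+ * smallest rw member device's capacity has been written, then check for new
+ * work.
+ */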
+static void rebalance_wait(struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ u64 now = atomic64_read(&clock->now);
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ r->wait_iotime_end = now + (min_member_capacity >> 6);
+
+ if (r->state != BCH_REBALANCE_waiting) {
+ r->wait_iotime_start = now;
+ r->wait_wallclock_start = ktime_get_real_ns();
+ r->state = BCH_REBALANCE_waiting;
+ }
+
+ bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
+
+static int do_rebalance(struct moving_context *ctxt)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_move_stats_init(&r->work_stats, "rebalance_work");
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+
+ bch2_trans_iter_init(trans, &rebalance_work_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ while (!bch2_move_ratelimit(ctxt) &&
+ !kthread_wait_freezable(r->enabled)) {
+ bch2_trans_begin(trans);
+
+ ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret || !k.k)
+ break;
+
+ ret = k.k->type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k.k->p.inode,
+ le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+ : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&rebalance_work_iter);
+ }
+
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_work_iter);
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ if (!ret &&
+ !kthread_should_stop() &&
+ !atomic64_read(&r->work_stats.sectors_seen) &&
+ !atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_trans_unlock_long(trans);
+ rebalance_wait(c);
+ }
+
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+ struct bch_fs *c = arg;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct moving_context ctxt;
+ int ret;
+
+ set_freezable();
+
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+ writepoint_ptr(&c->rebalance_write_point),
+ true);
+
+ while (!kthread_should_stop() &&
+ !(ret = do_rebalance(&ctxt)))
+ ;
+
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
+
+ prt_str(out, bch2_rebalance_state_strs[r->state]);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ switch (r->state) {
+ case BCH_REBALANCE_waiting: {
+ u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+ prt_str(out, "io wait duration: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ prt_newline(out);
+
+ prt_str(out, "io wait remaining: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ prt_newline(out);
+
+ prt_str(out, "duration waited: ");
+ bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+ prt_newline(out);
+ break;
+ }
+ case BCH_REBALANCE_working:
+ bch2_move_stats_to_text(out, &r->work_stats);
+ break;
+ case BCH_REBALANCE_scanning:
+ bch2_move_stats_to_text(out, &r->scan_stats);
+ break;
+ }
+ prt_newline(out);
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ c->rebalance.pd.rate.rate = UINT_MAX;
+ bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+ p = rcu_dereference_protected(c->rebalance.thread, 1);
+ c->rebalance.thread = NULL;
+
+ if (p) {
+ /* for synchronizing with rebalance_wakeup() */
+ synchronize_rcu();
+
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+}
+
+int bch2_rebalance_start(struct bch_fs *c)
+{
+ struct task_struct *p;
+ int ret;
+
+ if (c->rebalance.thread)
+ return 0;
+
+ if (c->opts.nochanges)
+ return 0;
+
+ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(p);
+ if (ret) {
+ bch_err_msg(c, ret, "creating rebalance thread");
+ return ret;
+ }
+
+ get_task_struct(p);
+ rcu_assign_pointer(c->rebalance.thread, p);
+ wake_up_process(p);
+ return 0;
+}
+
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+ bch2_pd_controller_init(&c->rebalance.pd);
+}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
new file mode 100644
index 000000000000..28a52638f16c
--- /dev/null
+++ b/fs/bcachefs/rebalance.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_H
+#define _BCACHEFS_REBALANCE_H
+
+#include "rebalance_types.h"
+
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
+static inline void rebalance_wakeup(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = rcu_dereference(c->rebalance.thread);
+ if (p)
+ wake_up_process(p);
+ rcu_read_unlock();
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_rebalance_stop(struct bch_fs *);
+int bch2_rebalance_start(struct bch_fs *);
+void bch2_fs_rebalance_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
new file mode 100644
index 000000000000..0fffb536c1d0
--- /dev/null
+++ b/fs/bcachefs/rebalance_types.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_TYPES_H
+#define _BCACHEFS_REBALANCE_TYPES_H
+
+#include "bbpos_types.h"
+#include "move_types.h"
+
+#define BCH_REBALANCE_STATES() \
+ x(waiting) \
+ x(working) \
+ x(scanning)
+
+enum bch_rebalance_states {
+#define x(t) BCH_REBALANCE_##t,
+ BCH_REBALANCE_STATES()
+#undef x
+};
+
+struct bch_fs_rebalance {
+ struct task_struct __rcu *thread;
+ struct bch_pd_controller pd;
+
+ enum bch_rebalance_states state;
+ u64 wait_iotime_start;
+ u64 wait_iotime_end;
+ u64 wait_wallclock_start;
+
+ struct bch_move_stats work_stats;
+
+ struct bbpos scan_start;
+ struct bbpos scan_end;
+ struct bch_move_stats scan_stats;
+
+ unsigned enabled:1;
+};
+
+#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
new file mode 100644
index 000000000000..5cf7d0532002
--- /dev/null
+++ b/fs/bcachefs/recovery.c
@@ -0,0 +1,1157 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "alloc_background.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "buckets.h"
+#include "dirent.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs-common.h"
+#include "fsck.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "move.h"
+#include "quota.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "sb-downgrade.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+#include <linux/stat.h>
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+static bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* for -o reconstruct_alloc: */
+static void drop_alloc_keys(struct journal_keys *keys)
+{
+ size_t src, dst;
+
+ for (src = 0, dst = 0; src < keys->nr; src++)
+ if (!btree_id_is_alloc(keys->d[src].btree_id))
+ keys->d[dst++] = keys->d[src];
+
+ keys->nr = dst;
+}
+
+/*
+ * Btree node pointers have a field to stack a pointer to the in memory btree
+ * node; we need to zero out this field when reading in btree nodes, or when
+ * reading in keys from the journal:
+ */
+static void zero_out_btree_mem_ptr(struct journal_keys *keys)
+{
+ struct journal_key *i;
+
+ for (i = keys->d; i < keys->d + keys->nr; i++)
+ if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
+}
+
+/* journal replay: */
+
+static void replay_now_at(struct journal *j, u64 seq)
+{
+ BUG_ON(seq < j->replay_journal_seq);
+
+ seq = min(seq, j->replay_journal_seq_end);
+
+ while (j->replay_journal_seq < seq)
+ bch2_journal_pin_put(j, j->replay_journal_seq++);
+}
+
+static int bch2_journal_replay_key(struct btree_trans *trans,
+ struct journal_key *k)
+{
+ struct btree_iter iter;
+ unsigned iter_flags =
+ BTREE_ITER_INTENT|
+ BTREE_ITER_NOT_EXTENTS;
+ unsigned update_flags = BTREE_TRIGGER_NORUN;
+ int ret;
+
+ /*
+ * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
+ * keep the key cache coherent with the underlying btree. Nothing
+ * besides the allocator is doing updates yet so we don't need key cache
+ * coherency for non-alloc btrees, and key cache fills for snapshots
+ * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
+ * the snapshots recovery pass runs.
+ */
+ if (!k->level && k->btree_id == BTREE_ID_alloc)
+ iter_flags |= BTREE_ITER_CACHED;
+ else
+ update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
+
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, k->level,
+ iter_flags);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
+ /* Must be checked with btree locked: */
+ if (k->overwritten)
+ goto out;
+
+ ret = bch2_trans_update(trans, &iter, k->k, update_flags);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
+{
+ const struct journal_key *l = *((const struct journal_key **)_l);
+ const struct journal_key *r = *((const struct journal_key **)_r);
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int bch2_journal_replay(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key **keys_sorted, *k;
+ struct journal *j = &c->journal;
+ u64 start_seq = c->journal_replay_seq_start;
+ u64 end_seq = c->journal_replay_seq_end;
+ size_t i;
+ int ret = 0;
+
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
+ keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
+ if (!keys_sorted)
+ return -BCH_ERR_ENOMEM_journal_replay;
+
+ for (i = 0; i < keys->nr; i++)
+ keys_sorted[i] = &keys->d[i];
+
+ sort(keys_sorted, keys->nr,
+ sizeof(keys_sorted[0]),
+ journal_sort_seq_cmp, NULL);
+
+ if (keys->nr) {
+ ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
+ keys->nr, start_seq, end_seq);
+ if (ret)
+ goto err;
+ }
+
+ BUG_ON(!atomic_read(&keys->ref));
+
+ for (i = 0; i < keys->nr; i++) {
+ k = keys_sorted[i];
+
+ cond_resched();
+
+ replay_now_at(j, k->journal_seq);
+
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL|
+ (!k->allocated
+ ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
+ : 0),
+ bch2_journal_replay_key(trans, k));
+ if (ret) {
+ bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
+ bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
+ goto err;
+ }
+ }
+
+ if (!c->opts.keep_journal)
+ bch2_journal_keys_put_initial(c);
+
+ replay_now_at(j, j->replay_journal_seq_end);
+ j->replay_journal_seq = 0;
+
+ bch2_journal_set_replay_done(j);
+ bch2_journal_flush_all_pins(j);
+ ret = bch2_journal_error(j);
+
+ if (keys->nr && !ret)
+ bch2_journal_log_msg(c, "journal replay finished");
+err:
+ kvfree(keys_sorted);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
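
The function above sorts an array of pointers to the journal keys by the sequence number of the journal entry each key came from, then replays them in that order so replay_now_at() can release journal pins as replay advances. The sketch below shows only that sort-then-replay shape in plain userspace C, assuming nothing beyond the standard library; the struct fields, names and the printf stand-in for the btree update are illustrative, not the kernel API.

    /* Standalone sketch: replay journal keys in sequence order (illustrative only). */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    struct demo_journal_key {
        uint64_t journal_seq;   /* sequence number of the entry the key came from */
        const char *what;       /* stand-in for the actual btree update */
    };

    static int seq_cmp(const void *l, const void *r)
    {
        const struct demo_journal_key *a = l, *b = r;

        return (a->journal_seq > b->journal_seq) - (a->journal_seq < b->journal_seq);
    }

    int main(void)
    {
        struct demo_journal_key keys[] = {
            { 12, "update inode 42" },
            { 10, "create inode 42" },
            { 11, "alloc bucket 7"  },
        };
        size_t nr = sizeof(keys) / sizeof(keys[0]);

        /* Sort by journal_seq, then "replay" in that order: */
        qsort(keys, nr, sizeof(keys[0]), seq_cmp);

        for (size_t i = 0; i < nr; i++)
            printf("replaying seq %llu: %s\n",
                   (unsigned long long) keys[i].journal_seq, keys[i].what);
        return 0;
    }

Replaying in sequence order is what allows the pin on each journal entry to be dropped as soon as the last key from that entry has been applied.
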
+
+/* journal replay early: */
+
+static int journal_replay_entry_early(struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ int ret = 0;
+
+ switch (entry->type) {
+ case BCH_JSET_ENTRY_btree_root: {
+ struct btree_root *r;
+
+ while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
+ ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
+ if (ret)
+ return ret;
+ }
+
+ r = bch2_btree_id_root(c, entry->btree_id);
+
+ if (entry->u64s) {
+ r->level = entry->level;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+ r->error = 0;
+ } else {
+ r->error = -EIO;
+ }
+ r->alive = true;
+ break;
+ }
+ case BCH_JSET_ENTRY_usage: {
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+
+ switch (entry->btree_id) {
+ case BCH_FS_USAGE_reserved:
+ if (entry->level < BCH_REPLICAS_MAX)
+ c->usage_base->persistent_reserved[entry->level] =
+ le64_to_cpu(u->v);
+ break;
+ case BCH_FS_USAGE_inodes:
+ c->usage_base->nr_inodes = le64_to_cpu(u->v);
+ break;
+ case BCH_FS_USAGE_key_version:
+ atomic64_set(&c->key_version,
+ le64_to_cpu(u->v));
+ break;
+ }
+
+ break;
+ }
+ case BCH_JSET_ENTRY_data_usage: {
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+
+ ret = bch2_replicas_set_usage(c, &u->r,
+ le64_to_cpu(u->v));
+ break;
+ }
+ case BCH_JSET_ENTRY_dev_usage: {
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
+
+ for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+ }
+
+ break;
+ }
+ case BCH_JSET_ENTRY_blacklist: {
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ ret = bch2_journal_seq_blacklist_add(c,
+ le64_to_cpu(bl_entry->seq),
+ le64_to_cpu(bl_entry->seq) + 1);
+ break;
+ }
+ case BCH_JSET_ENTRY_blacklist_v2: {
+ struct jset_entry_blacklist_v2 *bl_entry =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ ret = bch2_journal_seq_blacklist_add(c,
+ le64_to_cpu(bl_entry->start),
+ le64_to_cpu(bl_entry->end) + 1);
+ break;
+ }
+ case BCH_JSET_ENTRY_clock: {
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
+ }
+ }
+
+ return ret;
+}
+
+static int journal_replay_early(struct bch_fs *c,
+ struct bch_sb_field_clean *clean)
+{
+ struct jset_entry *entry;
+ int ret;
+
+ if (clean) {
+ for (entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ ret = journal_replay_entry_early(c, entry);
+ if (ret)
+ return ret;
+ }
+ } else {
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ vstruct_for_each(&i->j, entry) {
+ ret = journal_replay_entry_early(c, entry);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ bch2_fs_usage_initialize(c);
+
+ return 0;
+}
+
+/* sb clean section: */
+
+static int read_btree_roots(struct bch_fs *c)
+{
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (!r->alive)
+ continue;
+
+ if (btree_id_is_alloc(i) &&
+ c->opts.reconstruct_alloc) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ continue;
+ }
+
+ if (r->error) {
+ __fsck_err(c,
+ btree_id_is_alloc(i)
+ ? FSCK_CAN_IGNORE : 0,
+ btree_root_bkey_invalid,
+ "invalid btree root %s",
+ bch2_btree_id_str(i));
+ if (i == BTREE_ID_alloc)
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ }
+
+ ret = bch2_btree_root_read(c, i, &r->key, r->level);
+ if (ret) {
+ fsck_err(c,
+ btree_root_read_error,
+ "error reading btree root %s",
+ bch2_btree_id_str(i));
+ if (btree_id_is_alloc(i))
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ ret = 0;
+ }
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (!r->b) {
+ r->alive = false;
+ r->level = 0;
+ bch2_btree_root_alloc(c, i);
+ }
+ }
+fsck_err:
+ return ret;
+}
+
+static int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot_tree root_tree;
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_tree_init(&root_tree.k_i);
+ root_tree.k.p.offset = 1;
+ root_tree.v.master_subvol = cpu_to_le32(1);
+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+ root_snapshot.v.tree = cpu_to_le32(1);
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* set bi_subvol on root inode */
+noinline_for_stack
+static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ __bch2_fs_upgrade_for_subvolumes(trans));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+ return bch2_gc(c, true, c->opts.norecovery);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ return 0;
+}
+
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+static bool check_version_upgrade(struct bch_fs *c)
+{
+ unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
+ unsigned latest_version = bcachefs_metadata_version_current;
+ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
+ unsigned new_version = 0;
+
+ if (old_version < bcachefs_metadata_required_upgrade_below) {
+ if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
+ latest_compatible < bcachefs_metadata_required_upgrade_below)
+ new_version = latest_version;
+ else
+ new_version = latest_compatible;
+ } else {
+ switch (c->opts.version_upgrade) {
+ case BCH_VERSION_UPGRADE_compatible:
+ new_version = latest_compatible;
+ break;
+ case BCH_VERSION_UPGRADE_incompatible:
+ new_version = latest_version;
+ break;
+ case BCH_VERSION_UPGRADE_none:
+ new_version = old_version;
+ break;
+ }
+ }
+
+ if (new_version > old_version) {
+ struct printbuf buf = PRINTBUF;
+
+ if (old_version < bcachefs_metadata_required_upgrade_below)
+ prt_str(&buf, "Version upgrade required:\n");
+
+ if (old_version != c->sb.version) {
+ prt_str(&buf, "Version upgrade from ");
+ bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
+ prt_str(&buf, " to ");
+ bch2_version_to_text(&buf, c->sb.version);
+ prt_str(&buf, " incomplete\n");
+ }
+
+ prt_printf(&buf, "Doing %s version upgrade from ",
+ BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
+ ? "incompatible" : "compatible");
+ bch2_version_to_text(&buf, old_version);
+ prt_str(&buf, " to ");
+ bch2_version_to_text(&buf, new_version);
+ prt_newline(&buf);
+
+ u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
+ if (recovery_passes) {
+ if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
+ prt_str(&buf, "fsck required");
+ else {
+ prt_str(&buf, "running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
+ }
+
+ c->recovery_passes_explicit |= recovery_passes;
+ c->opts.fix_errors = FSCK_FIX_yes;
+ }
+
+ bch_info(c, "%s", buf.buf);
+
+ bch2_sb_upgrade(c, new_version);
+
+ printbuf_exit(&buf);
+ return true;
+ }
+
+ return false;
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+ u64 ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
+
+ if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
+ return false;
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return true;
+ if ((p->when & PASS_FSCK) && c->opts.fsck)
+ return true;
+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+ return true;
+ if (p->when & PASS_ALWAYS)
+ return true;
+ return false;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ int ret;
+
+ c->curr_recovery_pass = pass;
+
+ if (should_run_recovery_pass(c, pass)) {
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+ if (!(p->when & PASS_SILENT))
+ printk(KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ printk(KERN_CONT " done\n");
+
+ c->recovery_passes_complete |= BIT_ULL(pass);
+ }
+
+ return 0;
+}
+
+static int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
+ continue;
+ if (ret)
+ break;
+ c->curr_recovery_pass++;
+ }
+
+ return ret;
+}
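
The two functions above form the pass scheduler: should_run_recovery_pass() gates each pass on its PASS_* flags and the explicit-passes bitmask, and bch2_run_recovery_passes() walks the table, re-running the current pass when it returns a restart_recovery error (bch2_run_explicit_recovery_pass() in recovery.h may also have rewound curr_recovery_pass by then). Below is a compact standalone sketch of that control flow with invented pass names and flag values; it only models re-running the current slot, not the full rewind.

    /* Illustrative sketch of a gated, restartable pass runner; not the kernel code. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define DEMO_PASS_ALWAYS  (1u << 0)
    #define DEMO_PASS_FSCK    (1u << 1)
    #define DEMO_RESTART      (-1000)   /* stand-in for -BCH_ERR_restart_recovery */

    struct demo_pass {
        const char *name;
        unsigned when;
        int (*fn)(void);
    };

    static int demo_ok(void) { return 0; }

    static int demo_restart_once(void)
    {
        static bool restarted;

        if (!restarted) {
            restarted = true;
            return DEMO_RESTART;        /* ask the runner to run this pass again */
        }
        return 0;
    }

    static const struct demo_pass demo_passes[] = {
        { "alloc_read",        DEMO_PASS_ALWAYS, demo_ok },
        { "check_allocations", DEMO_PASS_FSCK,   demo_restart_once },
        { "journal_replay",    DEMO_PASS_ALWAYS, demo_ok },
    };

    int main(void)
    {
        bool fsck = true;               /* pretend fsck was requested */
        size_t curr = 0;

        while (curr < sizeof(demo_passes) / sizeof(demo_passes[0])) {
            const struct demo_pass *p = &demo_passes[curr];
            bool run = (p->when & DEMO_PASS_ALWAYS) ||
                       ((p->when & DEMO_PASS_FSCK) && fsck);

            if (run) {
                printf("%s...\n", p->name);
                int ret = p->fn();
                if (ret == DEMO_RESTART)
                    continue;           /* rerun without advancing, like the kernel loop */
                if (ret)
                    return 1;
                printf(" done\n");
            }
            curr++;
        }
        return 0;
    }
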
+
+int bch2_fs_recovery(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *clean = NULL;
+ struct jset *last_journal_entry = NULL;
+ u64 last_seq = 0, blacklist_seq, journal_seq;
+ int ret = 0;
+
+ if (c->sb.clean) {
+ clean = bch2_read_superblock_clean(c);
+ ret = PTR_ERR_OR_ZERO(clean);
+ if (ret)
+ goto err;
+
+ bch_info(c, "recovering from clean shutdown, journal seq %llu",
+ le64_to_cpu(clean->journal_seq));
+ } else {
+ bch_info(c, "recovering from unclean shutdown");
+ }
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (c->opts.fsck && c->opts.norecovery) {
+ bch_err(c, "cannot select both norecovery and fsck");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (!(c->opts.nochanges && c->opts.norecovery)) {
+ mutex_lock(&c->sb_lock);
+ bool write_sb = false;
+
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
+ if (!ext) {
+ ret = -BCH_ERR_ENOSPC_sb;
+ mutex_unlock(&c->sb_lock);
+ goto err;
+ }
+
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
+ write_sb = true;
+ }
+
+ u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (sb_passes) {
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
+ prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (bch2_check_version_downgrade(c)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Version downgrade required:\n");
+
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_downgrade(c,
+ BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+ BCH_VERSION_MINOR(c->sb.version));
+ passes = ext->recovery_passes_required[0] & ~passes;
+ if (passes) {
+ prt_str(&buf, " running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+ }
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ write_sb = true;
+ }
+
+ if (check_version_upgrade(c))
+ write_sb = true;
+
+ if (write_sb)
+ bch2_write_super(c);
+
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+
+ ret = bch2_blacklist_table_initialize(c);
+ if (ret) {
+ bch_err(c, "error initializing blacklist table");
+ goto err;
+ }
+
+ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+ struct genradix_iter iter;
+ struct journal_replay **i;
+
+ bch_verbose(c, "starting journal read");
+ ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
+ if (ret)
+ goto err;
+
+ /*
+ * note: cmd_list_journal needs the blacklist table fully up to date so
+ * it can asterisk ignored journal entries:
+ */
+ if (c->opts.read_journal_only)
+ goto out;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i && !(*i)->ignore) {
+ last_journal_entry = &(*i)->j;
+ break;
+ }
+
+ if (mustfix_fsck_err_on(c->sb.clean &&
+ last_journal_entry &&
+ !journal_entry_empty(last_journal_entry), c,
+ clean_but_journal_not_empty,
+ "filesystem marked clean but journal not empty")) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ }
+
+ if (!last_journal_entry) {
+ fsck_err_on(!c->sb.clean, c,
+ dirty_but_no_journal_entries,
+ "no journal entries found");
+ if (clean)
+ goto use_clean;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i) {
+ last_journal_entry = &(*i)->j;
+ (*i)->ignore = false;
+ /*
+ * This was probably a NO_FLUSH entry,
+ * so last_seq was garbage - but we know
+ * we're only using a single journal
+ * entry, set it here:
+ */
+ (*i)->j.last_seq = (*i)->j.seq;
+ break;
+ }
+ }
+
+ ret = bch2_journal_keys_sort(c);
+ if (ret)
+ goto err;
+
+ if (c->sb.clean && last_journal_entry) {
+ ret = bch2_verify_superblock_clean(c, &clean,
+ last_journal_entry);
+ if (ret)
+ goto err;
+ }
+ } else {
+use_clean:
+ if (!clean) {
+ bch_err(c, "no superblock clean section found");
+ ret = -BCH_ERR_fsck_repair_impossible;
+ goto err;
+ }
+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+ }
+
+ c->journal_replay_seq_start = last_seq;
+ c->journal_replay_seq_end = blacklist_seq - 1;
+
+ if (c->opts.reconstruct_alloc) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ drop_alloc_keys(&c->journal_keys);
+ }
+
+ zero_out_btree_mem_ptr(&c->journal_keys);
+
+ ret = journal_replay_early(c, clean);
+ if (ret)
+ goto err;
+
+ /*
+ * After an unclean shutdown, skip the next few journal sequence
+ * numbers as they may have been referenced by btree writes that
+ * happened before their corresponding journal writes - those btree
+ * writes need to be ignored, by skipping and blacklisting the next few
+ * journal sequence numbers:
+ */
+ if (!c->sb.clean)
+ journal_seq += 8;
+
+ if (blacklist_seq != journal_seq) {
+ ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
+ blacklist_seq, journal_seq) ?:
+ bch2_journal_seq_blacklist_add(c,
+ blacklist_seq, journal_seq);
+ if (ret) {
+ bch_err(c, "error creating new journal seq blacklist entry");
+ goto err;
+ }
+ }
+
+ ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
+ journal_seq, last_seq, blacklist_seq - 1) ?:
+ bch2_fs_journal_start(&c->journal, journal_seq);
+ if (ret)
+ goto err;
+
+ if (c->opts.reconstruct_alloc)
+ bch2_journal_log_msg(c, "dropping alloc info");
+
+ /*
+ * Skip past versions that might have been used (as nonces),
+ * but hadn't had their pointers written:
+ */
+ if (c->sb.encryption_type && !c->sb.clean)
+ atomic64_add(1 << 16, &c->key_version);
+
+ ret = read_btree_roots(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_run_recovery_passes(c);
+ if (ret)
+ goto err;
+
+ /* If we fixed errors, verify that fs is actually clean now: */
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ test_bit(BCH_FS_ERRORS_FIXED, &c->flags) &&
+ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) &&
+ !test_bit(BCH_FS_ERROR, &c->flags)) {
+ bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
+ clear_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+
+ ret = bch2_run_recovery_passes(c);
+ if (ret)
+ goto err;
+
+ if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) ||
+ test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ bch_err(c, "Second fsck run was not clean");
+ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+ }
+
+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ }
+
+ if (enabled_qtypes(c)) {
+ bch_verbose(c, "reading quotas");
+ ret = bch2_fs_quota_read(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "quotas done");
+ }
+
+ mutex_lock(&c->sb_lock);
+ bool write_sb = false;
+
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
+ write_sb = true;
+ }
+
+ if (!test_bit(BCH_FS_ERROR, &c->flags) &&
+ !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+ write_sb = true;
+ }
+
+ if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ if (ext &&
+ (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
+ memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
+ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
+ write_sb = true;
+ }
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
+ struct bch_move_stats stats;
+
+ bch2_move_stats_init(&stats, "recovery");
+
+ bch_info(c, "scanning for old btree nodes");
+ ret = bch2_fs_read_write(c) ?:
+ bch2_scan_old_btree_nodes(c, &stats);
+ if (ret)
+ goto err;
+ bch_info(c, "scanning for old btree nodes done");
+ }
+
+ if (c->journal_seq_blacklist_table &&
+ c->journal_seq_blacklist_table->nr > 128)
+ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
+
+ ret = 0;
+out:
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+ bch2_flush_fsck_errs(c);
+
+ if (!c->opts.keep_journal &&
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ bch2_journal_keys_put_initial(c);
+ kfree(clean);
+
+ if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
+ bch2_fs_read_write_early(c);
+ bch2_delete_dead_snapshots_async(c);
+ }
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+err:
+fsck_err:
+ bch2_fs_emergency_read_only(c);
+ goto out;
+}
+
+int bch2_fs_initialize(struct bch_fs *c)
+{
+ struct bch_inode_unpacked root_inode, lostfound_inode;
+ struct bkey_inode_buf packed_inode;
+ struct qstr lostfound = QSTR("lost+found");
+ struct bch_dev *ca;
+ unsigned i;
+ int ret;
+
+ bch_notice(c, "initializing new filesystem");
+
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+
+ bch2_check_version_downgrade(c);
+
+ if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
+ bch2_sb_upgrade(c, bcachefs_metadata_version_current);
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ bch2_btree_root_alloc(c, i);
+
+ for_each_member_device(ca, c, i)
+ bch2_dev_usage_init(ca);
+
+ ret = bch2_fs_journal_alloc(c);
+ if (ret)
+ goto err;
+
+ /*
+ * journal_res_get() will crash if called before this has
+ * set up the journal.pin FIFO and journal.cur pointer:
+ */
+ bch2_fs_journal_start(&c->journal, 1);
+ bch2_journal_set_replay_done(&c->journal);
+
+ ret = bch2_fs_read_write_early(c);
+ if (ret)
+ goto err;
+
+ /*
+ * Write out the superblock and journal buckets, now that we can do
+ * btree updates
+ */
+ bch_verbose(c, "marking superblocks");
+ ret = bch2_trans_mark_dev_sbs(c);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto err;
+
+ for_each_online_member(ca, c, i)
+ ca->new_fs_bucket_idx = 0;
+
+ ret = bch2_fs_freespace_init(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+
+ bch_verbose(c, "reading snapshots table");
+ ret = bch2_snapshots_read(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
+ root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+ bch2_inode_pack(&packed_inode, &root_inode);
+ packed_inode.inode.k.p.snapshot = U32_MAX;
+
+ ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
+ if (ret) {
+ bch_err_msg(c, ret, "creating root directory");
+ goto err;
+ }
+
+ bch2_inode_init_early(c, &lostfound_inode);
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_create_trans(trans,
+ BCACHEFS_ROOT_SUBVOL_INUM,
+ &root_inode, &lostfound_inode,
+ &lostfound,
+ 0, 0, S_IFDIR|0700, 0,
+ NULL, NULL, (subvol_inum) { 0 }, 0));
+ if (ret) {
+ bch_err_msg(c, ret, "creating lost+found");
+ goto err;
+ }
+
+ if (enabled_qtypes(c)) {
+ ret = bch2_fs_quota_read(c);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_journal_flush(&c->journal);
+ if (ret) {
+ bch_err_msg(c, ret, "writing first journal entry");
+ goto err;
+ }
+
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+err:
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
new file mode 100644
index 000000000000..3a554b0751d0
--- /dev/null
+++ b/fs/bcachefs/recovery.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_H
+#define _BCACHEFS_RECOVERY_H
+
+extern const char * const bch2_recovery_passes[];
+
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return 0;
+
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->recovery_passes_explicit |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass >= pass) {
+ c->curr_recovery_pass = pass;
+ c->recovery_passes_complete &= (1ULL << pass) >> 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_fs_recovery(struct bch_fs *);
+int bch2_fs_initialize(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
new file mode 100644
index 000000000000..d37c6fd30e38
--- /dev/null
+++ b/fs/bcachefs/recovery_types.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_TYPES_H
+#define _BCACHEFS_RECOVERY_TYPES_H
+
+#define PASS_SILENT BIT(0)
+#define PASS_FSCK BIT(1)
+#define PASS_UNCLEAN BIT(2)
+#define PASS_ALWAYS BIT(3)
+
+/*
+ * Passes may be reordered, but the second field is a persistent identifier and
+ * must never change:
+ */
+#define BCH_RECOVERY_PASSES() \
+ x(alloc_read, 0, PASS_ALWAYS) \
+ x(stripes_read, 1, PASS_ALWAYS) \
+ x(initialize_subvolumes, 2, 0) \
+ x(snapshots_read, 3, PASS_ALWAYS) \
+ x(check_topology, 4, 0) \
+ x(check_allocations, 5, PASS_FSCK) \
+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
+ x(journal_replay, 9, PASS_ALWAYS) \
+ x(check_alloc_info, 10, PASS_FSCK) \
+ x(check_lrus, 11, PASS_FSCK) \
+ x(check_btree_backpointers, 12, PASS_FSCK) \
+ x(check_backpointers_to_extents, 13, PASS_FSCK) \
+ x(check_extents_to_backpointers, 14, PASS_FSCK) \
+ x(check_alloc_to_lru_refs, 15, PASS_FSCK) \
+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
+ x(bucket_gens_init, 17, 0) \
+ x(check_snapshot_trees, 18, PASS_FSCK) \
+ x(check_snapshots, 19, PASS_FSCK) \
+ x(check_subvols, 20, PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_FSCK) \
+ x(fs_upgrade_for_subvolumes, 22, 0) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
+ x(check_inodes, 24, PASS_FSCK) \
+ x(check_extents, 25, PASS_FSCK) \
+ x(check_indirect_extents, 26, PASS_FSCK) \
+ x(check_dirents, 27, PASS_FSCK) \
+ x(check_xattrs, 28, PASS_FSCK) \
+ x(check_root, 29, PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_FSCK) \
+ x(check_nlinks, 31, PASS_FSCK) \
+ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
+ x(fix_reflink_p, 33, 0) \
+ x(set_fs_needs_rebalance, 34, 0) \
+
+/* We normally enumerate recovery passes in the order we run them: */
+enum bch_recovery_pass {
+#define x(n, id, when) BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+/* But we also need stable identifiers that can be used in the superblock */
+enum bch_recovery_pass_stable {
+#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+#endif /* _BCACHEFS_RECOVERY_TYPES_H */
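
BCH_RECOVERY_PASSES() above is an x-macro list that is expanded twice: once to generate the run-order enum, and once to generate the stable on-disk identifiers from the second column; bch2_recovery_passes_to_stable()/_from_stable() in recovery.c then remap bitmasks between the two numberings. The following is a small self-contained illustration of the same technique with a made-up pass list; the DEMO_* names and values are not from the kernel.

    /* X-macro pattern: one list, two numberings (illustrative names only). */
    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_PASSES()       \
        x(alpha,  0)            \
        x(beta,   7)            \
        x(gamma,  3)

    /* Run-order enum: values follow list order and may change if passes are reordered. */
    enum demo_pass {
    #define x(n, id) DEMO_PASS_##n,
        DEMO_PASSES()
    #undef x
    };

    /* Stable enum: values come from the second column and must never change. */
    enum demo_pass_stable {
    #define x(n, id) DEMO_PASS_STABLE_##n = id,
        DEMO_PASSES()
    #undef x
    };

    /* Remap a run-order bitmask to the stable numbering, like *_to_stable(): */
    static uint64_t demo_passes_to_stable(uint64_t v)
    {
        static const uint8_t map[] = {
    #define x(n, id) [DEMO_PASS_##n] = DEMO_PASS_STABLE_##n,
            DEMO_PASSES()
    #undef x
        };
        uint64_t ret = 0;

        for (unsigned i = 0; i < sizeof(map); i++)
            if (v & ((uint64_t) 1 << i))
                ret |= (uint64_t) 1 << map[i];
        return ret;
    }

    int main(void)
    {
        uint64_t v = (1 << DEMO_PASS_beta) | (1 << DEMO_PASS_gamma);

        printf("run-order mask 0x%llx -> stable mask 0x%llx\n",
               (unsigned long long) v,
               (unsigned long long) demo_passes_to_stable(v));
        return 0;
    }

The point of the second numbering is that the superblock's recovery_passes_required field stores stable IDs, so passes can be reordered in the source list without invalidating what an existing superblock recorded.
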
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
new file mode 100644
index 000000000000..37d16e04e671
--- /dev/null
+++ b/fs/bcachefs/reflink.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "extents.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "rebalance.h"
+#include "reflink.h"
+#include "subvolume.h"
+#include "super-io.h"
+
+#include <linux/sched/signal.h>
+
+static inline unsigned bkey_type_to_indirect(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_extent:
+ return KEY_TYPE_reflink_v;
+ case KEY_TYPE_inline_data:
+ return KEY_TYPE_indirect_inline_data;
+ default:
+ return 0;
+ }
+}
+
+/* reflink pointers */
+
+int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
+ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) {
+ prt_printf(err, "idx < front_pad (%llu < %u)",
+ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ prt_printf(out, "idx %llu front_pad %u back_pad %u",
+ le64_to_cpu(p.v->idx),
+ le32_to_cpu(p.v->front_pad),
+ le32_to_cpu(p.v->back_pad));
+}
+
+bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
+ struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
+
+ /*
+ * Disabled for now, the triggers code needs to be reworked for merging
+ * of reflink pointers to work:
+ */
+ return false;
+
+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
+ return false;
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+/* indirect extents */
+
+int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ return bch2_bkey_ptrs_invalid(c, k, flags, err);
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+#if 0
+Currently disabled, needs to be debugged:
+
+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
+
+ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
+}
+#endif
+
+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+{
+ if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+ new->k.type = KEY_TYPE_deleted;
+ new->k.size = 0;
+ set_bkey_val_u64s(&new->k, 0);
+ *flags &= ~BTREE_TRIGGER_INSERT;
+ }
+}
+
+int bch2_trans_mark_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ check_indirect_extent_deleting(new, &flags);
+
+ if (old.k->type == KEY_TYPE_reflink_v &&
+ new->k.type == KEY_TYPE_reflink_v &&
+ old.k->u64s == new->k.u64s &&
+ !memcmp(bkey_s_c_to_reflink_v(old).v->start,
+ bkey_i_to_reflink_v(new)->v.start,
+ bkey_val_bytes(&new->k) - 8))
+ return 0;
+
+ return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+}
+
+/* indirect inline data */
+
+int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ return 0;
+}
+
+void bch2_indirect_inline_data_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ prt_printf(out, "refcount %llu datalen %u: %*phN",
+ le64_to_cpu(d.v->refcount), datalen,
+ min(datalen, 32U), d.v->data);
+}
+
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ check_indirect_extent_deleting(new, &flags);
+
+ return 0;
+}
+
+static int bch2_make_extent_indirect(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *orig)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter reflink_iter = { NULL };
+ struct bkey_s_c k;
+ struct bkey_i *r_v;
+ struct bkey_i_reflink_p *r_p;
+ __le64 *refcount;
+ int ret;
+
+ if (orig->k.type == KEY_TYPE_inline_data)
+ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
+
+ bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_prev(&reflink_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
+ ret = PTR_ERR_OR_ZERO(r_v);
+ if (ret)
+ goto err;
+
+ bkey_init(&r_v->k);
+ r_v->k.type = bkey_type_to_indirect(&orig->k);
+ r_v->k.p = reflink_iter.pos;
+ bch2_key_resize(&r_v->k, orig->k.size);
+ r_v->k.version = orig->k.version;
+
+ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
+
+ refcount = bkey_refcount(r_v);
+ *refcount = 0;
+ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
+
+ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
+ if (ret)
+ goto err;
+
+ /*
+ * orig is in a bkey_buf, which statically allocates five u64s for the val,
+ * so we know it will be big enough:
+ */
+ orig->k.type = KEY_TYPE_reflink_p;
+ r_p = bkey_i_to_reflink_p(orig);
+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+
+ /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
+#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
+ __underlying_memset(&r_p->v, 0, sizeof(r_p->v));
+#else
+ memset(&r_p->v, 0, sizeof(r_p->v));
+#endif
+
+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
+
+ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+err:
+ bch2_trans_iter_exit(trans, &reflink_iter);
+
+ return ret;
+}
+
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) {
+ if (bkey_extent_is_unwritten(k))
+ continue;
+
+ if (bkey_extent_is_data(k.k))
+ return k;
+ }
+
+ if (bkey_ge(iter->pos, end))
+ bch2_btree_iter_set_pos(iter, end);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
+s64 bch2_remap_range(struct bch_fs *c,
+ subvol_inum dst_inum, u64 dst_offset,
+ subvol_inum src_inum, u64 src_offset,
+ u64 remap_sectors,
+ u64 new_i_size, s64 *i_sectors_delta)
+{
+ struct btree_trans *trans;
+ struct btree_iter dst_iter, src_iter;
+ struct bkey_s_c src_k;
+ struct bkey_buf new_dst, new_src;
+ struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+ struct bpos src_start = POS(src_inum.inum, src_offset);
+ struct bpos dst_end = dst_start, src_end = src_start;
+ struct bch_io_opts opts;
+ struct bpos src_want;
+ u64 dst_done = 0;
+ u32 dst_snapshot, src_snapshot;
+ int ret = 0, ret2 = 0;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
+ return -BCH_ERR_erofs_no_writes;
+
+ bch2_check_set_feature(c, BCH_FEATURE_reflink);
+
+ dst_end.offset += remap_sectors;
+ src_end.offset += remap_sectors;
+
+ bch2_bkey_buf_init(&new_dst);
+ bch2_bkey_buf_init(&new_src);
+ trans = bch2_trans_get(c);
+
+ ret = bch2_inum_opts_get(trans, src_inum, &opts);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
+ BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
+ BTREE_ITER_INTENT);
+
+ while ((ret == 0 ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
+ bkey_lt(dst_iter.pos, dst_end)) {
+ struct disk_reservation disk_res = { 0 };
+
+ bch2_trans_begin(trans);
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
+ &src_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+ ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
+ &dst_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
+ dst_done = dst_iter.pos.offset - dst_start.offset;
+ src_want = POS(src_start.inode, src_start.offset + dst_done);
+ bch2_btree_iter_set_pos(&src_iter, src_want);
+
+ src_k = get_next_src(&src_iter, src_end);
+ ret = bkey_err(src_k);
+ if (ret)
+ continue;
+
+ if (bkey_lt(src_want, src_iter.pos)) {
+ ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
+ min(dst_end.offset,
+ dst_iter.pos.offset +
+ src_iter.pos.offset - src_want.offset),
+ i_sectors_delta);
+ continue;
+ }
+
+ if (src_k.k->type != KEY_TYPE_reflink_p) {
+ bch2_btree_iter_set_pos_to_extent_start(&src_iter);
+
+ bch2_bkey_buf_reassemble(&new_src, c, src_k);
+ src_k = bkey_i_to_s_c(new_src.k);
+
+ ret = bch2_make_extent_indirect(trans, &src_iter,
+ new_src.k);
+ if (ret)
+ continue;
+
+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+ }
+
+ if (src_k.k->type == KEY_TYPE_reflink_p) {
+ struct bkey_s_c_reflink_p src_p =
+ bkey_s_c_to_reflink_p(src_k);
+ struct bkey_i_reflink_p *dst_p =
+ bkey_reflink_p_init(new_dst.k);
+
+ u64 offset = le64_to_cpu(src_p.v->idx) +
+ (src_want.offset -
+ bkey_start_offset(src_k.k));
+
+ dst_p->v.idx = cpu_to_le64(offset);
+ } else {
+ BUG();
+ }
+
+ new_dst.k->k.p = dst_iter.pos;
+ bch2_key_resize(&new_dst.k->k,
+ min(src_k.k->p.offset - src_want.offset,
+ dst_end.offset - dst_iter.pos.offset));
+
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_extent_update(trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res,
+ new_i_size, i_sectors_delta,
+ true);
+ bch2_disk_reservation_put(c, &disk_res);
+ }
+ bch2_trans_iter_exit(trans, &dst_iter);
+ bch2_trans_iter_exit(trans, &src_iter);
+
+ BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
+ BUG_ON(bkey_gt(dst_iter.pos, dst_end));
+
+ dst_done = dst_iter.pos.offset - dst_start.offset;
+ new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
+
+ do {
+ struct bch_inode_unpacked inode_u;
+ struct btree_iter inode_iter = { NULL };
+
+ bch2_trans_begin(trans);
+
+ ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
+ dst_inum, BTREE_ITER_INTENT);
+
+ if (!ret2 &&
+ inode_u.bi_size < new_i_size) {
+ inode_u.bi_size = new_i_size;
+ ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+ }
+
+ bch2_trans_iter_exit(trans, &inode_iter);
+ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
+err:
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&new_src, c);
+ bch2_bkey_buf_exit(&new_dst, c);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
+
+ return dst_done ?: ret ?: ret2;
+}
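
In bch2_remap_range() above, once the source extent has been made indirect, the destination simply gets a reflink_p key whose idx is the source pointer's idx plus how far into the source key the remap starts. The following is a tiny arithmetic sketch of that offset calculation, using simplified made-up types and omitting the front_pad/back_pad fields of the real bch_reflink_p.

    /* Sketch of reflink pointer offset arithmetic (simplified; no pad fields). */
    #include <stdio.h>
    #include <stdint.h>

    struct demo_reflink_p {
        uint64_t idx;   /* start offset of the referenced indirect extent */
        uint64_t size;  /* sectors covered by this pointer */
    };

    /*
     * The source file already points at an indirect extent starting at src.idx.
     * If the remap wants to start `delta` sectors into the source extent, the
     * new pointer for the destination just starts that much further into the
     * indirect extent:
     */
    static struct demo_reflink_p remap_pointer(struct demo_reflink_p src,
                                               uint64_t delta, uint64_t len)
    {
        struct demo_reflink_p dst = {
            .idx  = src.idx + delta,
            .size = len,
        };
        return dst;
    }

    int main(void)
    {
        struct demo_reflink_p src = { .idx = 1000, .size = 128 };

        /* Remap the middle 32 sectors of the source extent: */
        struct demo_reflink_p dst = remap_pointer(src, 48, 32);

        printf("dst points at indirect extent [%llu, %llu)\n",
               (unsigned long long) dst.idx,
               (unsigned long long) (dst.idx + dst.size));
        return 0;
    }

Partial remaps fall out of this naturally: both files end up pointing into the same indirect extent at different starting offsets, while the refcount on the indirect extent tracks how many pointers reference it.
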
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
new file mode 100644
index 000000000000..8ccf3f9c4939
--- /dev/null
+++ b/fs/bcachefs/reflink.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+enum bkey_invalid_flags;
+
+int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
+ .key_invalid = bch2_reflink_p_invalid, \
+ .val_to_text = bch2_reflink_p_to_text, \
+ .key_merge = bch2_reflink_p_merge, \
+ .trans_trigger = bch2_trans_mark_reflink_p, \
+ .atomic_trigger = bch2_mark_reflink_p, \
+ .min_val_size = 16, \
+})
+
+int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+
+#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
+ .key_invalid = bch2_reflink_v_invalid, \
+ .val_to_text = bch2_reflink_v_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_reflink_v, \
+ .atomic_trigger = bch2_mark_extent, \
+ .min_val_size = 8, \
+})
+
+int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_indirect_inline_data_to_text(struct printbuf *,
+ struct bch_fs *, struct bkey_s_c);
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *,
+ unsigned);
+
+#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
+ .key_invalid = bch2_indirect_inline_data_invalid, \
+ .val_to_text = bch2_indirect_inline_data_to_text, \
+ .trans_trigger = bch2_trans_mark_indirect_inline_data, \
+ .min_val_size = 8, \
+})
+
+static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_s_c_to_reflink_v(k).v->refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
+ default:
+ return NULL;
+ }
+}
+
+static inline __le64 *bkey_refcount(struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_i_to_reflink_v(k)->v.refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+ default:
+ return NULL;
+ }
+}
+
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+ subvol_inum, u64, u64, u64, s64 *);
+
+#endif /* _BCACHEFS_REFLINK_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
new file mode 100644
index 000000000000..2008fe8bf706
--- /dev/null
+++ b/fs/bcachefs/replicas.c
@@ -0,0 +1,1059 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+ struct bch_replicas_cpu *);
+
+/* Replicas tracking - in memory: */
+
+static void verify_replicas_entry(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned i;
+
+ BUG_ON(e->data_type >= BCH_DATA_NR);
+ BUG_ON(!e->nr_devs);
+ BUG_ON(e->nr_required > 1 &&
+ e->nr_required >= e->nr_devs);
+
+ for (i = 0; i + 1 < e->nr_devs; i++)
+ BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
+
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+{
+ bubble_sort(e->devs, e->nr_devs, u8_cmp);
+}
+
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
+static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
+ struct bch_replicas_entry_v0 *e)
+{
+ unsigned i;
+
+ if (e->data_type < BCH_DATA_NR)
+ prt_printf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ prt_printf(out, "(invalid data type %u)", e->data_type);
+
+ prt_printf(out, ": %u [", e->nr_devs);
+ for (i = 0; i < e->nr_devs; i++)
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
+}
+
+void bch2_replicas_entry_to_text(struct printbuf *out,
+ struct bch_replicas_entry *e)
+{
+ unsigned i;
+
+ if (e->data_type < BCH_DATA_NR)
+ prt_printf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ prt_printf(out, "(invalid data type %u)", e->data_type);
+
+ prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
+ for (i = 0; i < e->nr_devs; i++)
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
+}
+
+int bch2_replicas_entry_validate(struct bch_replicas_entry *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
+{
+ if (!r->nr_devs) {
+ prt_printf(err, "no devices in entry ");
+ goto bad;
+ }
+
+ if (r->nr_required > 1 &&
+ r->nr_required >= r->nr_devs) {
+ prt_printf(err, "bad nr_required in entry ");
+ goto bad;
+ }
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (!bch2_dev_exists(sb, r->devs[i])) {
+ prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+ goto bad;
+ }
+
+ return 0;
+bad:
+ bch2_replicas_entry_to_text(err, r);
+ return -BCH_ERR_invalid_replicas_entry;
+}
+
+void bch2_cpu_replicas_to_text(struct printbuf *out,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_replicas_entry *e;
+ bool first = true;
+
+ for_each_cpu_replicas_entry(r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
+
+ bch2_replicas_entry_to_text(out, e);
+ }
+}
+
+static void extent_to_replicas(struct bkey_s_c k,
+ struct bch_replicas_entry *r)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ r->nr_required = 1;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
+ continue;
+
+ if (!p.has_ec)
+ r->devs[r->nr_devs++] = p.ptr.dev;
+ else
+ r->nr_required = 0;
+ }
+}
+
+static void stripe_to_replicas(struct bkey_s_c k,
+ struct bch_replicas_entry *r)
+{
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ const struct bch_extent_ptr *ptr;
+
+ r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
+
+ for (ptr = s.v->ptrs;
+ ptr < s.v->ptrs + s.v->nr_blocks;
+ ptr++)
+ r->devs[r->nr_devs++] = ptr->dev;
+}
+
+void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+ struct bkey_s_c k)
+{
+ e->nr_devs = 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ e->data_type = BCH_DATA_btree;
+ extent_to_replicas(k, e);
+ break;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ e->data_type = BCH_DATA_user;
+ extent_to_replicas(k, e);
+ break;
+ case KEY_TYPE_stripe:
+ e->data_type = BCH_DATA_parity;
+ stripe_to_replicas(k, e);
+ break;
+ }
+
+ bch2_replicas_entry_sort(e);
+}
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
+{
+ unsigned i;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_sb ||
+ data_type >= BCH_DATA_NR);
+
+ e->data_type = data_type;
+ e->nr_devs = 0;
+ e->nr_required = 1;
+
+ for (i = 0; i < devs.nr; i++)
+ e->devs[e->nr_devs++] = devs.devs[i];
+
+ bch2_replicas_entry_sort(e);
+}
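
Both helpers above finish with bch2_replicas_entry_sort(), which keeps every entry's device list in a canonical sorted order. That matters because the table lookups that follow compare whole entries with memcmp(). The sketch below, using a plain linear search and invented demo types rather than the kernel structures, shows why an unsorted query entry would fail to match.

    /* Why replicas entries are sorted before lookup: memcmp needs a canonical form. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>
    #include <stdbool.h>

    struct demo_entry {
        uint8_t data_type;
        uint8_t nr_devs;
        uint8_t devs[4];
    };

    static int u8_cmp(const void *l, const void *r)
    {
        return *(const uint8_t *) l - *(const uint8_t *) r;
    }

    static void demo_entry_sort(struct demo_entry *e)
    {
        qsort(e->devs, e->nr_devs, 1, u8_cmp);
    }

    static bool demo_table_has(const struct demo_entry *table, size_t nr,
                               const struct demo_entry *search)
    {
        for (size_t i = 0; i < nr; i++)
            if (!memcmp(&table[i], search, sizeof(*search)))
                return true;
        return false;
    }

    int main(void)
    {
        /* Table entries are stored in canonical (sorted-devices) form: */
        struct demo_entry table[] = {
            { .data_type = 1, .nr_devs = 2, .devs = { 0, 3 } },
            { .data_type = 2, .nr_devs = 3, .devs = { 1, 2, 5 } },
        };

        /* A caller naturally builds the device list in I/O order: */
        struct demo_entry search = { .data_type = 2, .nr_devs = 3, .devs = { 5, 1, 2 } };

        printf("before sort: %s\n",
               demo_table_has(table, 2, &search) ? "found" : "not found");

        demo_entry_sort(&search);
        printf("after sort:  %s\n",
               demo_table_has(table, 2, &search) ? "found" : "not found");
        return 0;
    }
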
+
+static struct bch_replicas_cpu
+cpu_replicas_add_entry(struct bch_fs *c,
+ struct bch_replicas_cpu *old,
+ struct bch_replicas_entry *new_entry)
+{
+ unsigned i;
+ struct bch_replicas_cpu new = {
+ .nr = old->nr + 1,
+ .entry_size = max_t(unsigned, old->entry_size,
+ replicas_entry_bytes(new_entry)),
+ };
+
+ for (i = 0; i < new_entry->nr_devs; i++)
+ BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));
+
+ BUG_ON(!new_entry->data_type);
+ verify_replicas_entry(new_entry);
+
+ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
+ if (!new.entries)
+ return new;
+
+ for (i = 0; i < old->nr; i++)
+ memcpy(cpu_replicas_entry(&new, i),
+ cpu_replicas_entry(old, i),
+ old->entry_size);
+
+ memcpy(cpu_replicas_entry(&new, old->nr),
+ new_entry,
+ replicas_entry_bytes(new_entry));
+
+ bch2_cpu_replicas_sort(&new);
+ return new;
+}
+
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+ struct bch_replicas_entry *search)
+{
+ int idx, entry_size = replicas_entry_bytes(search);
+
+ if (unlikely(entry_size > r->entry_size))
+ return -1;
+
+ verify_replicas_entry(search);
+
+#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
+ idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+ entry_cmp, search);
+#undef entry_cmp
+
+ return idx < r->nr ? idx : -1;
+}
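
__replicas_entry_idx() above searches the table with eytzinger0_find(): the sorted entries are stored in "eytzinger" (breadth-first tree) order rather than plain sorted order, which keeps the hottest comparisons near the front of the array. Below is a generic, self-contained sketch of the layout and the descent-style search on ints; the indexing conventions of the kernel's eytzinger0 helpers may differ in detail.

    /* Generic eytzinger (BFS-order) layout and search sketch; not the bcachefs helpers. */
    #include <stdio.h>
    #include <stddef.h>

    /* Fill eytz[] by an in-order walk of the implicit tree rooted at node i: */
    static size_t eytzinger_build(const int *sorted, int *eytz, size_t n,
                                  size_t i, size_t pos)
    {
        if (i >= n)
            return pos;
        pos = eytzinger_build(sorted, eytz, n, 2 * i + 1, pos);  /* left subtree */
        eytz[i] = sorted[pos++];                                 /* this node */
        return eytzinger_build(sorted, eytz, n, 2 * i + 2, pos); /* right subtree */
    }

    /* Search: walk down the implicit tree, going right when the probe is bigger. */
    static int eytzinger_find(const int *eytz, size_t n, int x)
    {
        size_t i = 0;

        while (i < n) {
            if (eytz[i] == x)
                return (int) i;
            i = 2 * i + 1 + (x > eytz[i]);
        }
        return -1;
    }

    int main(void)
    {
        int sorted[] = { 1, 3, 4, 7, 9, 12, 15 };
        int eytz[7];

        eytzinger_build(sorted, eytz, 7, 0, 0);
        /* eytz is now { 7, 3, 12, 1, 4, 9, 15 } */

        printf("index of 9: %d, index of 8: %d\n",
               eytzinger_find(eytz, 7, 9),
               eytzinger_find(eytz, 7, 8));
        return 0;
    }

The same idea applies to the replicas table, except that its entries are fixed-size byte strings compared with memcmp() rather than ints.
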
+
+int bch2_replicas_entry_idx(struct bch_fs *c,
+ struct bch_replicas_entry *search)
+{
+ bch2_replicas_entry_sort(search);
+
+ return __replicas_entry_idx(&c->replicas, search);
+}
+
+static bool __replicas_has_entry(struct bch_replicas_cpu *r,
+ struct bch_replicas_entry *search)
+{
+ return __replicas_entry_idx(r, search) >= 0;
+}
+
+bool bch2_replicas_marked(struct bch_fs *c,
+ struct bch_replicas_entry *search)
+{
+ bool marked;
+
+ if (!search->nr_devs)
+ return true;
+
+ verify_replicas_entry(search);
+
+ percpu_down_read(&c->mark_lock);
+ marked = __replicas_has_entry(&c->replicas, search) &&
+ (likely(!c->replicas_gc.entries) ||
+ __replicas_has_entry(&c->replicas_gc, search));
+ percpu_up_read(&c->mark_lock);
+
+ return marked;
+}
+
+static void __replicas_table_update(struct bch_fs_usage *dst,
+ struct bch_replicas_cpu *dst_r,
+ struct bch_fs_usage *src,
+ struct bch_replicas_cpu *src_r)
+{
+ int src_idx, dst_idx;
+
+ *dst = *src;
+
+ for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+ if (!src->replicas[src_idx])
+ continue;
+
+ dst_idx = __replicas_entry_idx(dst_r,
+ cpu_replicas_entry(src_r, src_idx));
+ BUG_ON(dst_idx < 0);
+
+ dst->replicas[dst_idx] = src->replicas[src_idx];
+ }
+}
+
+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
+ struct bch_replicas_cpu *dst_r,
+ struct bch_fs_usage __percpu *src_p,
+ struct bch_replicas_cpu *src_r)
+{
+ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+ struct bch_fs_usage *dst, *src = (void *)
+ bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);
+
+ preempt_disable();
+ dst = this_cpu_ptr(dst_p);
+ preempt_enable();
+
+ __replicas_table_update(dst, dst_r, src, src_r);
+}
+
+/*
+ * Resize filesystem accounting:
+ */
+static int replicas_table_update(struct bch_fs *c,
+ struct bch_replicas_cpu *new_r)
+{
+ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
+ struct bch_fs_usage_online *new_scratch = NULL;
+ struct bch_fs_usage __percpu *new_gc = NULL;
+ struct bch_fs_usage *new_base = NULL;
+ unsigned i, bytes = sizeof(struct bch_fs_usage) +
+ sizeof(u64) * new_r->nr;
+ unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
+ sizeof(u64) * new_r->nr;
+ int ret = 0;
+
+ memset(new_usage, 0, sizeof(new_usage));
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+ sizeof(u64), GFP_KERNEL)))
+ goto err;
+
+ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
+ !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
+ (c->usage_gc &&
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (c->usage[i])
+ __replicas_table_update_pcpu(new_usage[i], new_r,
+ c->usage[i], &c->replicas);
+ if (c->usage_base)
+ __replicas_table_update(new_base, new_r,
+ c->usage_base, &c->replicas);
+ if (c->usage_gc)
+ __replicas_table_update_pcpu(new_gc, new_r,
+ c->usage_gc, &c->replicas);
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ swap(c->usage[i], new_usage[i]);
+ swap(c->usage_base, new_base);
+ swap(c->usage_scratch, new_scratch);
+ swap(c->usage_gc, new_gc);
+ swap(c->replicas, *new_r);
+out:
+ free_percpu(new_gc);
+ kfree(new_scratch);
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ free_percpu(new_usage[i]);
+ kfree(new_base);
+ return ret;
+err:
+ bch_err(c, "error updating replicas table: memory allocation failure");
+ ret = -BCH_ERR_ENOMEM_replicas_table;
+ goto out;
+}
+
+static unsigned reserve_journal_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_replicas_entry *e;
+ unsigned journal_res_u64s = 0;
+
+ /* nr_inodes: */
+ journal_res_u64s +=
+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
+
+ /* key_version: */
+ journal_res_u64s +=
+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
+
+ /* persistent_reserved: */
+ journal_res_u64s +=
+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
+ BCH_REPLICAS_MAX;
+
+ for_each_cpu_replicas_entry(r, e)
+ journal_res_u64s +=
+ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
+ e->nr_devs, sizeof(u64));
+ return journal_res_u64s;
+}
+
+noinline
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
+ struct bch_replicas_entry *new_entry)
+{
+ struct bch_replicas_cpu new_r, new_gc;
+ int ret = 0;
+
+ verify_replicas_entry(new_entry);
+
+ memset(&new_r, 0, sizeof(new_r));
+ memset(&new_gc, 0, sizeof(new_gc));
+
+ mutex_lock(&c->sb_lock);
+
+ if (c->replicas_gc.entries &&
+ !__replicas_has_entry(&c->replicas_gc, new_entry)) {
+ new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
+ if (!new_gc.entries) {
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ goto err;
+ }
+ }
+
+ if (!__replicas_has_entry(&c->replicas, new_entry)) {
+ new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
+ if (!new_r.entries) {
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ goto err;
+ }
+
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
+ if (ret)
+ goto err;
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->replicas_journal_res,
+ reserve_journal_replicas(c, &new_r));
+ }
+
+ if (!new_r.entries &&
+ !new_gc.entries)
+ goto out;
+
+ /* allocations done, now commit: */
+
+ if (new_r.entries)
+ bch2_write_super(c);
+
+ /* don't update in memory replicas until changes are persistent */
+ percpu_down_write(&c->mark_lock);
+ if (new_r.entries)
+ ret = replicas_table_update(c, &new_r);
+ if (new_gc.entries)
+ swap(new_gc, c->replicas_gc);
+ percpu_up_write(&c->mark_lock);
+out:
+ mutex_unlock(&c->sb_lock);
+
+ kfree(new_r.entries);
+ kfree(new_gc.entries);
+
+ return ret;
+err:
+ bch_err_msg(c, ret, "adding replicas entry");
+ goto out;
+}
+
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+{
+ return likely(bch2_replicas_marked(c, r))
+ ? 0 : bch2_mark_replicas_slowpath(c, r);
+}
+
+/* replicas delta list: */
+
+int bch2_replicas_delta_list_mark(struct bch_fs *c,
+ struct replicas_delta_list *r)
+{
+ struct replicas_delta *d = r->d;
+ struct replicas_delta *top = (void *) r->d + r->used;
+ int ret = 0;
+
+ for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
+ ret = bch2_mark_replicas(c, &d->r);
+ return ret;
+}
+
+/*
+ * Old replicas_gc mechanism: only used for journal replicas entries now, should
+ * die at some point:
+ */
+
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
+{
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ percpu_down_write(&c->mark_lock);
+
+ ret = ret ?:
+ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
+ replicas_table_update(c, &c->replicas_gc);
+
+ kfree(c->replicas_gc.entries);
+ c->replicas_gc.entries = NULL;
+
+ percpu_up_write(&c->mark_lock);
+
+ if (!ret)
+ bch2_write_super(c);
+
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+ struct bch_replicas_entry *e;
+ unsigned i = 0;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ BUG_ON(c->replicas_gc.entries);
+
+ c->replicas_gc.nr = 0;
+ c->replicas_gc.entry_size = 0;
+
+ for_each_cpu_replicas_entry(&c->replicas, e)
+ if (!((1 << e->data_type) & typemask)) {
+ c->replicas_gc.nr++;
+ c->replicas_gc.entry_size =
+ max_t(unsigned, c->replicas_gc.entry_size,
+ replicas_entry_bytes(e));
+ }
+
+ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
+ c->replicas_gc.entry_size,
+ GFP_KERNEL);
+ if (!c->replicas_gc.entries) {
+ mutex_unlock(&c->sb_lock);
+ bch_err(c, "error allocating c->replicas_gc");
+ return -BCH_ERR_ENOMEM_replicas_gc;
+ }
+
+ for_each_cpu_replicas_entry(&c->replicas, e)
+ if (!((1 << e->data_type) & typemask))
+ memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
+ e, c->replicas_gc.entry_size);
+
+ bch2_cpu_replicas_sort(&c->replicas_gc);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+/*
+ * New much simpler mechanism for clearing out unneeded replicas entries - drop
+ * replicas entries that have 0 sectors used.
+ *
+ * However, we don't track sector counts for journal usage, so this doesn't drop
+ * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
+ * is retained for that.
+ */
+int bch2_replicas_gc2(struct bch_fs *c)
+{
+ struct bch_replicas_cpu new = { 0 };
+ unsigned i, nr;
+ int ret = 0;
+
+ bch2_journal_meta(&c->journal);
+retry:
+ nr = READ_ONCE(c->replicas.nr);
+ new.entry_size = READ_ONCE(c->replicas.entry_size);
+ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
+ if (!new.entries) {
+ bch_err(c, "error allocating c->replicas_gc");
+ return -BCH_ERR_ENOMEM_replicas_gc;
+ }
+
+ mutex_lock(&c->sb_lock);
+ percpu_down_write(&c->mark_lock);
+
+ if (nr != c->replicas.nr ||
+ new.entry_size != c->replicas.entry_size) {
+ percpu_up_write(&c->mark_lock);
+ mutex_unlock(&c->sb_lock);
+ kfree(new.entries);
+ goto retry;
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ if (e->data_type == BCH_DATA_journal ||
+ c->usage_base->replicas[i] ||
+ percpu_u64_get(&c->usage[0]->replicas[i]) ||
+ percpu_u64_get(&c->usage[1]->replicas[i]) ||
+ percpu_u64_get(&c->usage[2]->replicas[i]) ||
+ percpu_u64_get(&c->usage[3]->replicas[i]))
+ memcpy(cpu_replicas_entry(&new, new.nr++),
+ e, new.entry_size);
+ }
+
+ bch2_cpu_replicas_sort(&new);
+
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
+ replicas_table_update(c, &new);
+
+ kfree(new.entries);
+
+ percpu_up_write(&c->mark_lock);
+
+ if (!ret)
+ bch2_write_super(c);
+
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch2_replicas_set_usage(struct bch_fs *c,
+ struct bch_replicas_entry *r,
+ u64 sectors)
+{
+ int ret, idx = bch2_replicas_entry_idx(c, r);
+
+ if (idx < 0) {
+ struct bch_replicas_cpu n;
+
+ n = cpu_replicas_add_entry(c, &c->replicas, r);
+ if (!n.entries)
+ return -BCH_ERR_ENOMEM_cpu_replicas;
+
+ ret = replicas_table_update(c, &n);
+ if (ret)
+ return ret;
+
+ kfree(n.entries);
+
+ idx = bch2_replicas_entry_idx(c, r);
+ BUG_ON(idx < 0);
+ }
+
+ c->usage_base->replicas[idx] = sectors;
+
+ return 0;
+}
+
+/* Replicas tracking - superblock: */
+
+static int
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
+ struct bch_replicas_cpu *cpu_r)
+{
+ struct bch_replicas_entry *e, *dst;
+ unsigned nr = 0, entry_size = 0, idx = 0;
+
+ for_each_replicas_entry(sb_r, e) {
+ entry_size = max_t(unsigned, entry_size,
+ replicas_entry_bytes(e));
+ nr++;
+ }
+
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+ if (!cpu_r->entries)
+ return -BCH_ERR_ENOMEM_cpu_replicas;
+
+ cpu_r->nr = nr;
+ cpu_r->entry_size = entry_size;
+
+ for_each_replicas_entry(sb_r, e) {
+ dst = cpu_replicas_entry(cpu_r, idx++);
+ memcpy(dst, e, replicas_entry_bytes(e));
+ bch2_replicas_entry_sort(dst);
+ }
+
+ return 0;
+}
+
+static int
+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
+ struct bch_replicas_cpu *cpu_r)
+{
+ struct bch_replicas_entry_v0 *e;
+ unsigned nr = 0, entry_size = 0, idx = 0;
+
+ for_each_replicas_entry(sb_r, e) {
+ entry_size = max_t(unsigned, entry_size,
+ replicas_entry_bytes(e));
+ nr++;
+ }
+
+ entry_size += sizeof(struct bch_replicas_entry) -
+ sizeof(struct bch_replicas_entry_v0);
+
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+ if (!cpu_r->entries)
+ return -BCH_ERR_ENOMEM_cpu_replicas;
+
+ cpu_r->nr = nr;
+ cpu_r->entry_size = entry_size;
+
+ for_each_replicas_entry(sb_r, e) {
+ struct bch_replicas_entry *dst =
+ cpu_replicas_entry(cpu_r, idx++);
+
+ dst->data_type = e->data_type;
+ dst->nr_devs = e->nr_devs;
+ dst->nr_required = 1;
+ memcpy(dst->devs, e->devs, e->nr_devs);
+ bch2_replicas_entry_sort(dst);
+ }
+
+ return 0;
+}
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+ struct bch_sb_field_replicas *sb_v1;
+ struct bch_sb_field_replicas_v0 *sb_v0;
+ struct bch_replicas_cpu new_r = { 0, 0, NULL };
+ int ret = 0;
+
+ if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
+ else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
+ if (ret)
+ return ret;
+
+ bch2_cpu_replicas_sort(&new_r);
+
+ percpu_down_write(&c->mark_lock);
+
+ ret = replicas_table_update(c, &new_r);
+ percpu_up_write(&c->mark_lock);
+
+ kfree(new_r.entries);
+
+ return ret;
+}
+
+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_sb_field_replicas_v0 *sb_r;
+ struct bch_replicas_entry_v0 *dst;
+ struct bch_replicas_entry *src;
+ size_t bytes;
+
+ bytes = sizeof(struct bch_sb_field_replicas);
+
+ for_each_cpu_replicas_entry(r, src)
+ bytes += replicas_entry_bytes(src) - 1;
+
+ sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
+ DIV_ROUND_UP(bytes, sizeof(u64)));
+ if (!sb_r)
+ return -BCH_ERR_ENOSPC_sb_replicas;
+
+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
+ sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
+
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
+
+ dst = sb_r->entries;
+ for_each_cpu_replicas_entry(r, src) {
+ dst->data_type = src->data_type;
+ dst->nr_devs = src->nr_devs;
+ memcpy(dst->devs, src->devs, src->nr_devs);
+
+ dst = replicas_entry_next(dst);
+
+ BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+ }
+
+ return 0;
+}
+
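+/*
+ * Write the cpu replicas table to the superblock, using the older v0 format
+ * (which has no nr_required field) when every entry has nr_required == 1:
+ */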
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_entry *dst, *src;
+ bool need_v1 = false;
+ size_t bytes;
+
+ bytes = sizeof(struct bch_sb_field_replicas);
+
+ for_each_cpu_replicas_entry(r, src) {
+ bytes += replicas_entry_bytes(src);
+ if (src->nr_required != 1)
+ need_v1 = true;
+ }
+
+ if (!need_v1)
+ return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
+
+ sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
+ DIV_ROUND_UP(bytes, sizeof(u64)));
+ if (!sb_r)
+ return -BCH_ERR_ENOSPC_sb_replicas;
+
+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
+ sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
+
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
+
+ dst = sb_r->entries;
+ for_each_cpu_replicas_entry(r, src) {
+ memcpy(dst, src, replicas_entry_bytes(src));
+
+ dst = replicas_entry_next(dst);
+
+ BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+ }
+
+ return 0;
+}
+
+static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
+ struct bch_sb *sb,
+ struct printbuf *err)
+{
+ unsigned i;
+
+ sort_cmp_size(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ memcmp, NULL);
+
+ for (i = 0; i < cpu_r->nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(cpu_r, i);
+
+ int ret = bch2_replicas_entry_validate(e, sb, err);
+ if (ret)
+ return ret;
+
+ if (i + 1 < cpu_r->nr) {
+ struct bch_replicas_entry *n =
+ cpu_replicas_entry(cpu_r, i + 1);
+
+ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
+
+ if (!memcmp(e, n, cpu_r->entry_size)) {
+ prt_printf(err, "duplicate replicas entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -BCH_ERR_invalid_sb_replicas;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+ struct bch_replicas_cpu cpu_r;
+ int ret;
+
+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
+ if (ret)
+ return ret;
+
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+ kfree(cpu_r.entries);
+ return ret;
+}
+
+static void bch2_sb_replicas_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_replicas *r = field_to_type(f, replicas);
+ struct bch_replicas_entry *e;
+ bool first = true;
+
+ for_each_replicas_entry(r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
+
+ bch2_replicas_entry_to_text(out, e);
+ }
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
+ .validate = bch2_sb_replicas_validate,
+ .to_text = bch2_sb_replicas_to_text,
+};
+
+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_cpu cpu_r;
+ int ret;
+
+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
+ if (ret)
+ return ret;
+
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+ kfree(cpu_r.entries);
+ return ret;
+}
+
+static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_entry_v0 *e;
+ bool first = true;
+
+ for_each_replicas_entry(sb_r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
+
+ bch2_replicas_entry_v0_to_text(out, e);
+ }
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
+ .validate = bch2_sb_replicas_v0_validate,
+ .to_text = bch2_sb_replicas_v0_to_text,
+};
+
+/* Query replicas: */
+
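+/*
+ * Returns true if, with only the devices in @devs online, every replicas entry
+ * still has at least nr_required replicas available - or if the caller
+ * explicitly allowed degraded/lost data via the matching BCH_FORCE_IF_* flags:
+ */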
+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
+ unsigned flags, bool print)
+{
+ struct bch_replicas_entry *e;
+ bool ret = true;
+
+ percpu_down_read(&c->mark_lock);
+ for_each_cpu_replicas_entry(&c->replicas, e) {
+ unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+ bool metadata = e->data_type < BCH_DATA_user;
+
+ if (e->data_type == BCH_DATA_cached)
+ continue;
+
+ for (i = 0; i < e->nr_devs; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
+
+ nr_online += test_bit(e->devs[i], devs.d);
+ nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
+ }
+
+ if (nr_failed == e->nr_devs)
+ continue;
+
+ if (nr_online < e->nr_required)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_LOST
+ : BCH_FORCE_IF_DATA_LOST;
+
+ if (nr_online < e->nr_devs)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_DEGRADED
+ : BCH_FORCE_IF_DATA_DEGRADED;
+
+ if (dflags & ~flags) {
+ if (print) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_replicas_entry_to_text(&buf, e);
+ bch_err(c, "insufficient devices online (%u) for replicas entry %s",
+ nr_online, buf.buf);
+ printbuf_exit(&buf);
+ }
+ ret = false;
+ break;
+ }
+
+ }
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
+}
+
+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
+{
+ struct bch_sb_field_replicas *replicas;
+ struct bch_sb_field_replicas_v0 *replicas_v0;
+ unsigned i, data_has = 0;
+
+ replicas = bch2_sb_field_get(sb, replicas);
+ replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
+
+ if (replicas) {
+ struct bch_replicas_entry *r;
+
+ for_each_replicas_entry(replicas, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ } else if (replicas_v0) {
+ struct bch_replicas_entry_v0 *r;
+
+ for_each_replicas_entry_v0(replicas_v0, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ }
+
+ return data_has;
+}
+
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+void bch2_fs_replicas_exit(struct bch_fs *c)
+{
+ unsigned i;
+
+ kfree(c->usage_scratch);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ free_percpu(c->usage[i]);
+ kfree(c->usage_base);
+ kfree(c->replicas.entries);
+ kfree(c->replicas_gc.entries);
+
+ mempool_exit(&c->replicas_delta_pool);
+}
+
+int bch2_fs_replicas_init(struct bch_fs *c)
+{
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->replicas_journal_res,
+ reserve_journal_replicas(c, &c->replicas));
+
+ return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
+ REPLICAS_DELTA_LIST_MAX) ?:
+ replicas_table_update(c, &c->replicas);
+}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
new file mode 100644
index 000000000000..f70a642775d1
--- /dev/null
+++ b/fs/bcachefs/replicas.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_H
+#define _BCACHEFS_REPLICAS_H
+
+#include "bkey.h"
+#include "eytzinger.h"
+#include "replicas_types.h"
+
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
+void bch2_replicas_entry_to_text(struct printbuf *,
+ struct bch_replicas_entry *);
+int bch2_replicas_entry_validate(struct bch_replicas_entry *,
+ struct bch_sb *, struct printbuf *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+ return (void *) r->entries + r->entry_size * i;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *,
+ struct bch_replicas_entry *);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+ enum bch_data_type,
+ struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
+int bch2_mark_replicas(struct bch_fs *,
+ struct bch_replicas_entry *);
+
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
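+ /* 8 == sizeof(d->delta); struct replicas_delta is __packed, so no padding */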
+ return (void *) d + replicas_entry_bytes(&d->r) + 8;
+}
+
+int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
+
+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+ unsigned dev)
+{
+ e->data_type = BCH_DATA_cached;
+ e->nr_devs = 1;
+ e->nr_required = 1;
+ e->devs[0] = dev;
+}
+
+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
+ unsigned, bool);
+
+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+int bch2_replicas_gc2(struct bch_fs *);
+
+int bch2_replicas_set_usage(struct bch_fs *,
+ struct bch_replicas_entry *,
+ u64);
+
+#define for_each_cpu_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+ _i = (void *) (_i) + (_r)->entry_size)
+
+/* iterate over superblock replicas - used by userspace tools: */
+
+#define replicas_entry_next(_i) \
+ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
+
+#define for_each_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
+#define for_each_replicas_entry_v0(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
+
+void bch2_fs_replicas_exit(struct bch_fs *);
+int bch2_fs_replicas_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
new file mode 100644
index 000000000000..5cfff489bbc3
--- /dev/null
+++ b/fs/bcachefs/replicas_types.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_TYPES_H
+#define _BCACHEFS_REPLICAS_TYPES_H
+
+struct bch_replicas_cpu {
+ unsigned nr;
+ unsigned entry_size;
+ struct bch_replicas_entry *entries;
+};
+
+struct replicas_delta {
+ s64 delta;
+ struct bch_replicas_entry r;
+} __packed;
+
+struct replicas_delta_list {
+ unsigned size;
+ unsigned used;
+
+ struct {} memset_start;
+ u64 nr_inodes;
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ struct {} memset_end;
+ struct replicas_delta d[0];
+};
+
+#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
new file mode 100644
index 000000000000..c76ad8ea5e4a
--- /dev/null
+++ b/fs/bcachefs/sb-clean.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "super-io.h"
+
+/*
+ * BCH_SB_FIELD_clean:
+ *
+ * Btree roots, and a few other things, are recovered from the journal after an
+ * unclean shutdown - but after a clean shutdown, to avoid having to read the
+ * journal, we can store them in the superblock.
+ *
+ * bch_sb_field_clean simply contains a list of journal entries, stored exactly
+ * as they would be in the journal:
+ */
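+/*
+ * In practice the clean section written by bch2_fs_mark_clean() holds the
+ * btree roots (via bch2_btree_roots_to_journal_entries()) plus the usage
+ * entries emitted by bch2_journal_super_entries_add_common().
+ */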
+
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
+ int write)
+{
+ struct jset_entry *entry;
+ int ret;
+
+ for (entry = clean->start;
+ entry < (struct jset_entry *) vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ ret = bch2_journal_entry_validate(c, NULL, entry,
+ le16_to_cpu(c->disk_sb.sb->version),
+ BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+ write);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+ struct bch_sb_field_clean *clean,
+ struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry, *start, *end;
+
+ if (clean) {
+ start = clean->start;
+ end = vstruct_end(&clean->field);
+ } else {
+ start = j->start;
+ end = vstruct_last(j);
+ }
+
+ for (entry = start; entry < end; entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_btree_root &&
+ entry->btree_id == id)
+ goto found;
+
+ return NULL;
+found:
+ if (!entry->u64s)
+ return ERR_PTR(-EINVAL);
+
+ k = entry->start;
+ *level = entry->level;
+ return k;
+}
+
+int bch2_verify_superblock_clean(struct bch_fs *c,
+ struct bch_sb_field_clean **cleanp,
+ struct jset *j)
+{
+ unsigned i;
+ struct bch_sb_field_clean *clean = *cleanp;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ int ret = 0;
+
+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+ sb_clean_journal_seq_mismatch,
+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+ le64_to_cpu(clean->journal_seq),
+ le64_to_cpu(j->seq))) {
+ kfree(clean);
+ *cleanp = NULL;
+ return 0;
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct bkey_i *k1, *k2;
+ unsigned l1 = 0, l2 = 0;
+
+ k1 = btree_root_find(c, clean, NULL, i, &l1);
+ k2 = btree_root_find(c, NULL, j, i, &l2);
+
+ if (!k1 && !k2)
+ continue;
+
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ if (k1)
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+ else
+ prt_printf(&buf1, "(none)");
+
+ if (k2)
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+ else
+ prt_printf(&buf2, "(none)");
+
+ mustfix_fsck_err_on(!k1 || !k2 ||
+ IS_ERR(k1) ||
+ IS_ERR(k2) ||
+ k1->k.u64s != k2->k.u64s ||
+ memcmp(k1, k2, bkey_bytes(&k1->k)) ||
+ l1 != l2, c,
+ sb_clean_btree_root_mismatch,
+ "superblock btree root %u doesn't match journal after clean shutdown\n"
+ "sb: l=%u %s\n"
+ "journal: l=%u %s\n", i,
+ l1, buf1.buf,
+ l2, buf2.buf);
+ }
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *clean, *sb_clean;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
+
+ if (fsck_err_on(!sb_clean, c,
+ sb_clean_missing,
+ "superblock marked clean but clean section not present")) {
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ mutex_unlock(&c->sb_lock);
+ return NULL;
+ }
+
+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+ GFP_KERNEL);
+ if (!clean) {
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
+ }
+
+ ret = bch2_sb_clean_validate_late(c, clean, READ);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+ }
+
+ mutex_unlock(&c->sb_lock);
+
+ return clean;
+fsck_err:
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+}
+
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+ memset(entry, 0, u64s * sizeof(u64));
+ /*
+ * The u64s field counts only the entry's data, not the shared jset_entry
+ * header fields - hence the "u64s - 1" below.
+ */
+ entry->u64s = cpu_to_le16(u64s - 1);
+
+ *end = vstruct_next(*end);
+ return entry;
+}
+
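+/*
+ * Emit the accounting entries that get stored with journal entries and in the
+ * superblock clean section: filesystem usage (nr_inodes, key_version,
+ * persistent_reserved), per-replicas-entry data usage, per-device usage, and
+ * the read/write IO clocks:
+ */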
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+ struct jset_entry **end,
+ u64 journal_seq)
+{
+ struct bch_dev *ca;
+ unsigned i, dev;
+
+ percpu_down_read(&c->mark_lock);
+
+ if (!journal_seq) {
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+ } else {
+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
+ }
+
+ {
+ struct jset_entry_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_usage;
+ u->entry.btree_id = BCH_FS_USAGE_inodes;
+ u->v = cpu_to_le64(c->usage_base->nr_inodes);
+ }
+
+ {
+ struct jset_entry_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_usage;
+ u->entry.btree_id = BCH_FS_USAGE_key_version;
+ u->v = cpu_to_le64(atomic64_read(&c->key_version));
+ }
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ struct jset_entry_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_usage;
+ u->entry.btree_id = BCH_FS_USAGE_reserved;
+ u->entry.level = i;
+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+ struct jset_entry_data_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+ struct jset_entry_data_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_data_usage;
+ u->v = cpu_to_le64(c->usage_base->replicas[i]);
+ unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
+ "embedded variable length struct");
+ }
+
+ for_each_member_device(ca, c, dev) {
+ unsigned b = sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+ struct jset_entry_dev_usage *u =
+ container_of(jset_entry_init(end, b),
+ struct jset_entry_dev_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_dev_usage;
+ u->dev = cpu_to_le32(dev);
+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+ }
+ }
+
+ percpu_up_read(&c->mark_lock);
+
+ for (i = 0; i < 2; i++) {
+ struct jset_entry_clock *clock =
+ container_of(jset_entry_init(end, sizeof(*clock)),
+ struct jset_entry_clock, entry);
+
+ clock->entry.type = BCH_JSET_ENTRY_clock;
+ clock->rw = i;
+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+ }
+}
+
+static int bch2_sb_clean_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&clean->field), sizeof(*clean));
+ return -BCH_ERR_invalid_sb_clean;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+ struct jset_entry *entry;
+
+ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
+ prt_newline(out);
+ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
+ prt_newline(out);
+
+ for (entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+ !entry->u64s)
+ continue;
+
+ bch2_journal_entry_to_text(out, NULL, entry);
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+ .validate = bch2_sb_clean_validate,
+ .to_text = bch2_sb_clean_to_text,
+};
+
+int bch2_fs_mark_dirty(struct bch_fs *c)
+{
+ int ret;
+
+ /*
+ * Unconditionally write superblock, to verify it hasn't changed before
+ * we go rw:
+ */
+
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+
+ ret = bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *sb_clean;
+ struct jset_entry *entry;
+ unsigned u64s;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ if (BCH_SB_CLEAN(c->disk_sb.sb))
+ goto out;
+
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
+
+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+ sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
+ if (!sb_clean) {
+ bch_err(c, "error resizing superblock while setting filesystem clean");
+ goto out;
+ }
+
+ sb_clean->flags = 0;
+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
+
+ /* Trying to catch outstanding bug: */
+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+
+ entry = sb_clean->start;
+ bch2_journal_super_entries_add_common(c, &entry, 0);
+ entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+
+ memset(entry, 0,
+ vstruct_end(&sb_clean->field) - (void *) entry);
+
+ /*
+ * this should be in the write path, and we should be validating every
+ * superblock section:
+ */
+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
+ if (ret) {
+ bch_err(c, "error writing marking filesystem clean: validate error");
+ goto out;
+ }
+
+ bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h
new file mode 100644
index 000000000000..71caef281239
--- /dev/null
+++ b/fs/bcachefs/sb-clean.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_CLEAN_H
+#define _BCACHEFS_SB_CLEAN_H
+
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
+ struct jset *);
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
+void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
+
+int bch2_fs_mark_dirty(struct bch_fs *);
+void bch2_fs_mark_clean(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
new file mode 100644
index 000000000000..4919237bbe73
--- /dev/null
+++ b/fs/bcachefs/sb-downgrade.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Superblock section that contains a list of recovery passes to run when
+ * downgrading past a given version
+ */
+
+#include "bcachefs.h"
+#include "darray.h"
+#include "recovery.h"
+#include "sb-downgrade.h"
+#include "sb-errors.h"
+#include "super-io.h"
+
+/*
+ * Downgrade table:
+ * When downgrading past certain versions, we need to run certain recovery passes
+ * and fix certain errors:
+ *
+ * x(version, recovery_passes, errors...)
+ */
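+/*
+ * The table below is currently empty; a hypothetical entry (names here are
+ * placeholders, not real versions, passes or errors) would look like:
+ *
+ *	x(some_version, BIT_ULL(SOME_RECOVERY_PASS), BCH_FSCK_ERR_some_error)
+ */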
+
+#define DOWNGRADE_TABLE()
+
+struct downgrade_entry {
+ u64 recovery_passes;
+ u16 version;
+ u16 nr_errors;
+ const u16 *errors;
+};
+
+#define x(ver, passes, ...) static const u16 ver##_errors[] = { __VA_ARGS__ };
+DOWNGRADE_TABLE()
+#undef x
+
+static const struct downgrade_entry downgrade_table[] = {
+#define x(ver, passes, ...) { \
+ .recovery_passes = passes, \
+ .version = bcachefs_metadata_version_##ver,\
+ .nr_errors = ARRAY_SIZE(ver##_errors), \
+ .errors = ver##_errors, \
+},
+DOWNGRADE_TABLE()
+#undef x
+};
+
+static inline const struct bch_sb_field_downgrade_entry *
+downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
+{
+ return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
+}
+
+#define for_each_downgrade_entry(_d, _i) \
+ for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
+ (void *) _i < vstruct_end(&(_d)->field) && \
+ (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \
+ _i = downgrade_entry_next_c(_i))
+
+static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
+
+ for_each_downgrade_entry(e, i) {
+ if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
+ BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
+ prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
+ BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
+ BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
+ return -BCH_ERR_invalid_sb_downgrade;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
+
+ if (out->nr_tabstops <= 1)
+ printbuf_tabstop_push(out, 16);
+
+ for_each_downgrade_entry(e, i) {
+ prt_str(out, "version:");
+ prt_tab(out);
+ bch2_version_to_text(out, le16_to_cpu(i->version));
+ prt_newline(out);
+
+ prt_str(out, "recovery passes:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
+ prt_newline(out);
+
+ prt_str(out, "errors:");
+ prt_tab(out);
+ bool first = true;
+ for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
+ if (!first)
+ prt_char(out, ',');
+ first = false;
+ unsigned e = le16_to_cpu(i->errors[j]);
+ prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)");
+ }
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
+ .validate = bch2_sb_downgrade_validate,
+ .to_text = bch2_sb_downgrade_to_text,
+};
+
+int bch2_sb_downgrade_update(struct bch_fs *c)
+{
+ darray_char table = {};
+ int ret = 0;
+
+ for (const struct downgrade_entry *src = downgrade_table;
+ src < downgrade_table + ARRAY_SIZE(downgrade_table);
+ src++) {
+ if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
+ continue;
+
+ struct bch_sb_field_downgrade_entry *dst;
+ unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
+
+ ret = darray_make_room(&table, bytes);
+ if (ret)
+ goto out;
+
+ dst = (void *) &darray_top(table);
+ dst->version = cpu_to_le16(src->version);
+ dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes);
+ dst->recovery_passes[1] = 0;
+ dst->nr_errors = cpu_to_le16(src->nr_errors);
+ for (unsigned i = 0; i < src->nr_errors; i++)
+ dst->errors[i] = cpu_to_le16(src->errors[i]);
+
+ table.nr += bytes;
+ }
+
+ struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
+
+ unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
+
+ if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
+ goto out;
+
+ d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
+ if (!d) {
+ ret = -BCH_ERR_ENOSPC_sb_downgrade;
+ goto out;
+ }
+
+ memcpy(d->entries, table.data, table.nr);
+ memset_u64s_tail(d->entries, 0, table.nr);
+out:
+ darray_exit(&table);
+ return ret;
+}
+
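+/*
+ * Downgrading from old_minor to new_minor: for every downgrade table entry in
+ * that version range, require its recovery passes to be run and silence its
+ * listed errors:
+ */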
+void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
+{
+ struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
+ if (!d)
+ return;
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ for_each_downgrade_entry(d, i) {
+ unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
+ if (new_minor < minor && minor <= old_minor) {
+ ext->recovery_passes_required[0] |= i->recovery_passes[0];
+ ext->recovery_passes_required[1] |= i->recovery_passes[1];
+
+ for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
+ unsigned e = le16_to_cpu(i->errors[j]);
+ if (e < BCH_SB_ERR_MAX)
+ __set_bit(e, c->sb.errors_silent);
+ if (e < sizeof(ext->errors_silent) * 8)
+ ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
+ }
+ }
+ }
+}
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
new file mode 100644
index 000000000000..bc48fd2ca70e
--- /dev/null
+++ b/fs/bcachefs/sb-downgrade.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_DOWNGRADE_H
+#define _BCACHEFS_SB_DOWNGRADE_H
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
+
+int bch2_sb_downgrade_update(struct bch_fs *);
+void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
+
+#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
new file mode 100644
index 000000000000..5f5bcae391fb
--- /dev/null
+++ b/fs/bcachefs/sb-errors.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "sb-errors.h"
+#include "super-io.h"
+
+const char * const bch2_sb_error_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_SB_ERRS()
+ NULL
+};
+
+static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+{
+ if (id < BCH_SB_ERR_MAX)
+ prt_str(out, bch2_sb_error_strs[id]);
+ else
+ prt_printf(out, "(unknown error %u)", id);
+}
+
+static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
+{
+ return bch2_sb_field_nr_entries(e);
+}
+
+static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
+{
+ return (sizeof(struct bch_sb_field_errors) +
+ sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
+}
+
+static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_errors *e = field_to_type(f, errors);
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+ for (i = 0; i < nr; i++) {
+ if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
+ prt_printf(err, "entry with count 0 (id ");
+ bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+ prt_printf(err, ")");
+ return -BCH_ERR_invalid_sb_errors;
+ }
+
+ if (i + 1 < nr &&
+ BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
+ BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
+ prt_printf(err, "entries out of order");
+ return -BCH_ERR_invalid_sb_errors;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_errors *e = field_to_type(f, errors);
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+ if (out->nr_tabstops <= 1)
+ printbuf_tabstop_push(out, 16);
+
+ for (i = 0; i < nr; i++) {
+ bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+ prt_tab(out);
+ prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_errors = {
+ .validate = bch2_sb_errors_validate,
+ .to_text = bch2_sb_errors_to_text,
+};
+
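+/*
+ * Count an occurrence of @err in memory; c->fsck_error_counts is kept sorted
+ * by error id, so new ids are inserted where the linear scan stops:
+ */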
+void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
+{
+ bch_sb_errors_cpu *e = &c->fsck_error_counts;
+ struct bch_sb_error_entry_cpu n = {
+ .id = err,
+ .nr = 1,
+ .last_error_time = ktime_get_real_seconds()
+ };
+ unsigned i;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+ for (i = 0; i < e->nr; i++) {
+ if (err == e->data[i].id) {
+ e->data[i].nr++;
+ e->data[i].last_error_time = n.last_error_time;
+ goto out;
+ }
+ if (err < e->data[i].id)
+ break;
+ }
+
+ if (darray_make_room(e, 1))
+ goto out;
+
+ darray_insert_item(e, i, n);
+out:
+ mutex_unlock(&c->fsck_error_counts_lock);
+}
+
+void bch2_sb_errors_from_cpu(struct bch_fs *c)
+{
+ bch_sb_errors_cpu *src = &c->fsck_error_counts;
+ struct bch_sb_field_errors *dst =
+ bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+ unsigned i;
+
+ if (!dst)
+ return;
+
+ for (i = 0; i < src->nr; i++) {
+ SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
+ SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
+ dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
+ }
+}
+
+static int bch2_sb_errors_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
+ bch_sb_errors_cpu *dst = &c->fsck_error_counts;
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
+ int ret;
+
+ if (!nr)
+ return 0;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+ ret = darray_make_room(dst, nr);
+ if (ret)
+ goto err;
+
+ dst->nr = nr;
+
+ for (i = 0; i < nr; i++) {
+ dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
+ dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
+ dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
+ }
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
+
+ return ret;
+}
+
+void bch2_fs_sb_errors_exit(struct bch_fs *c)
+{
+ darray_exit(&c->fsck_error_counts);
+}
+
+void bch2_fs_sb_errors_init_early(struct bch_fs *c)
+{
+ mutex_init(&c->fsck_error_counts_lock);
+ darray_init(&c->fsck_error_counts);
+}
+
+int bch2_fs_sb_errors_init(struct bch_fs *c)
+{
+ return bch2_sb_errors_to_cpu(c);
+}
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
new file mode 100644
index 000000000000..8889001e7db4
--- /dev/null
+++ b/fs/bcachefs/sb-errors.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_H
+#define _BCACHEFS_SB_ERRORS_H
+
+#include "sb-errors_types.h"
+
+extern const char * const bch2_sb_error_strs[];
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
+
+void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
+
+void bch2_sb_errors_from_cpu(struct bch_fs *);
+
+void bch2_fs_sb_errors_exit(struct bch_fs *);
+void bch2_fs_sb_errors_init_early(struct bch_fs *);
+int bch2_fs_sb_errors_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
new file mode 100644
index 000000000000..3504c2d09c29
--- /dev/null
+++ b/fs/bcachefs/sb-errors_types.h
@@ -0,0 +1,269 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
+#define _BCACHEFS_SB_ERRORS_TYPES_H
+
+#include "darray.h"
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0) \
+ x(dirty_but_no_journal_entries, 1) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
+ x(sb_clean_journal_seq_mismatch, 3) \
+ x(sb_clean_btree_root_mismatch, 4) \
+ x(sb_clean_missing, 5) \
+ x(jset_unsupported_version, 6) \
+ x(jset_unknown_csum, 7) \
+ x(jset_last_seq_newer_than_seq, 8) \
+ x(jset_past_bucket_end, 9) \
+ x(jset_seq_blacklisted, 10) \
+ x(journal_entries_missing, 11) \
+ x(journal_entry_replicas_not_marked, 12) \
+ x(journal_entry_past_jset_end, 13) \
+ x(journal_entry_replicas_data_mismatch, 14) \
+ x(journal_entry_bkey_u64s_0, 15) \
+ x(journal_entry_bkey_past_end, 16) \
+ x(journal_entry_bkey_bad_format, 17) \
+ x(journal_entry_bkey_invalid, 18) \
+ x(journal_entry_btree_root_bad_size, 19) \
+ x(journal_entry_blacklist_bad_size, 20) \
+ x(journal_entry_blacklist_v2_bad_size, 21) \
+ x(journal_entry_blacklist_v2_start_past_end, 22) \
+ x(journal_entry_usage_bad_size, 23) \
+ x(journal_entry_data_usage_bad_size, 24) \
+ x(journal_entry_clock_bad_size, 25) \
+ x(journal_entry_clock_bad_rw, 26) \
+ x(journal_entry_dev_usage_bad_size, 27) \
+ x(journal_entry_dev_usage_bad_dev, 28) \
+ x(journal_entry_dev_usage_bad_pad, 29) \
+ x(btree_node_unreadable, 30) \
+ x(btree_node_fault_injected, 31) \
+ x(btree_node_bad_magic, 32) \
+ x(btree_node_bad_seq, 33) \
+ x(btree_node_unsupported_version, 34) \
+ x(btree_node_bset_older_than_sb_min, 35) \
+ x(btree_node_bset_newer_than_sb, 36) \
+ x(btree_node_data_missing, 37) \
+ x(btree_node_bset_after_end, 38) \
+ x(btree_node_replicas_sectors_written_mismatch, 39) \
+ x(btree_node_replicas_data_mismatch, 40) \
+ x(bset_unknown_csum, 41) \
+ x(bset_bad_csum, 42) \
+ x(bset_past_end_of_btree_node, 43) \
+ x(bset_wrong_sector_offset, 44) \
+ x(bset_empty, 45) \
+ x(bset_bad_seq, 46) \
+ x(bset_blacklisted_journal_seq, 47) \
+ x(first_bset_blacklisted_journal_seq, 48) \
+ x(btree_node_bad_btree, 49) \
+ x(btree_node_bad_level, 50) \
+ x(btree_node_bad_min_key, 51) \
+ x(btree_node_bad_max_key, 52) \
+ x(btree_node_bad_format, 53) \
+ x(btree_node_bkey_past_bset_end, 54) \
+ x(btree_node_bkey_bad_format, 55) \
+ x(btree_node_bad_bkey, 56) \
+ x(btree_node_bkey_out_of_order, 57) \
+ x(btree_root_bkey_invalid, 58) \
+ x(btree_root_read_error, 59) \
+ x(btree_root_bad_min_key, 60) \
+ x(btree_root_bad_max_key, 61) \
+ x(btree_node_read_error, 62) \
+ x(btree_node_topology_bad_min_key, 63) \
+ x(btree_node_topology_bad_max_key, 64) \
+ x(btree_node_topology_overwritten_by_prev_node, 65) \
+ x(btree_node_topology_overwritten_by_next_node, 66) \
+ x(btree_node_topology_interior_node_empty, 67) \
+ x(fs_usage_hidden_wrong, 68) \
+ x(fs_usage_btree_wrong, 69) \
+ x(fs_usage_data_wrong, 70) \
+ x(fs_usage_cached_wrong, 71) \
+ x(fs_usage_reserved_wrong, 72) \
+ x(fs_usage_persistent_reserved_wrong, 73) \
+ x(fs_usage_nr_inodes_wrong, 74) \
+ x(fs_usage_replicas_wrong, 75) \
+ x(dev_usage_buckets_wrong, 76) \
+ x(dev_usage_sectors_wrong, 77) \
+ x(dev_usage_fragmented_wrong, 78) \
+ x(dev_usage_buckets_ec_wrong, 79) \
+ x(bkey_version_in_future, 80) \
+ x(bkey_u64s_too_small, 81) \
+ x(bkey_invalid_type_for_btree, 82) \
+ x(bkey_extent_size_zero, 83) \
+ x(bkey_extent_size_greater_than_offset, 84) \
+ x(bkey_size_nonzero, 85) \
+ x(bkey_snapshot_nonzero, 86) \
+ x(bkey_snapshot_zero, 87) \
+ x(bkey_at_pos_max, 88) \
+ x(bkey_before_start_of_btree_node, 89) \
+ x(bkey_after_end_of_btree_node, 90) \
+ x(bkey_val_size_nonzero, 91) \
+ x(bkey_val_size_too_small, 92) \
+ x(alloc_v1_val_size_bad, 93) \
+ x(alloc_v2_unpack_error, 94) \
+ x(alloc_v3_unpack_error, 95) \
+ x(alloc_v4_val_size_bad, 96) \
+ x(alloc_v4_backpointers_start_bad, 97) \
+ x(alloc_key_data_type_bad, 98) \
+ x(alloc_key_empty_but_have_data, 99) \
+ x(alloc_key_dirty_sectors_0, 100) \
+ x(alloc_key_data_type_inconsistency, 101) \
+ x(alloc_key_to_missing_dev_bucket, 102) \
+ x(alloc_key_cached_inconsistency, 103) \
+ x(alloc_key_cached_but_read_time_zero, 104) \
+ x(alloc_key_to_missing_lru_entry, 105) \
+ x(alloc_key_data_type_wrong, 106) \
+ x(alloc_key_gen_wrong, 107) \
+ x(alloc_key_dirty_sectors_wrong, 108) \
+ x(alloc_key_cached_sectors_wrong, 109) \
+ x(alloc_key_stripe_wrong, 110) \
+ x(alloc_key_stripe_redundancy_wrong, 111) \
+ x(bucket_sector_count_overflow, 112) \
+ x(bucket_metadata_type_mismatch, 113) \
+ x(need_discard_key_wrong, 114) \
+ x(freespace_key_wrong, 115) \
+ x(freespace_hole_missing, 116) \
+ x(bucket_gens_val_size_bad, 117) \
+ x(bucket_gens_key_wrong, 118) \
+ x(bucket_gens_hole_wrong, 119) \
+ x(bucket_gens_to_invalid_dev, 120) \
+ x(bucket_gens_to_invalid_buckets, 121) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
+ x(need_discard_freespace_key_bad, 124) \
+ x(backpointer_pos_wrong, 125) \
+ x(backpointer_to_missing_device, 126) \
+ x(backpointer_to_missing_alloc, 127) \
+ x(backpointer_to_missing_ptr, 128) \
+ x(lru_entry_at_time_0, 129) \
+ x(lru_entry_to_invalid_bucket, 130) \
+ x(lru_entry_bad, 131) \
+ x(btree_ptr_val_too_big, 132) \
+ x(btree_ptr_v2_val_too_big, 133) \
+ x(btree_ptr_has_non_ptr, 134) \
+ x(extent_ptrs_invalid_entry, 135) \
+ x(extent_ptrs_no_ptrs, 136) \
+ x(extent_ptrs_too_many_ptrs, 137) \
+ x(extent_ptrs_redundant_crc, 138) \
+ x(extent_ptrs_redundant_stripe, 139) \
+ x(extent_ptrs_unwritten, 140) \
+ x(extent_ptrs_written_and_unwritten, 141) \
+ x(ptr_to_invalid_device, 142) \
+ x(ptr_to_duplicate_device, 143) \
+ x(ptr_after_last_bucket, 144) \
+ x(ptr_before_first_bucket, 145) \
+ x(ptr_spans_multiple_buckets, 146) \
+ x(ptr_to_missing_backpointer, 147) \
+ x(ptr_to_missing_alloc_key, 148) \
+ x(ptr_to_missing_replicas_entry, 149) \
+ x(ptr_to_missing_stripe, 150) \
+ x(ptr_to_incorrect_stripe, 151) \
+ x(ptr_gen_newer_than_bucket_gen, 152) \
+ x(ptr_too_stale, 153) \
+ x(stale_dirty_ptr, 154) \
+ x(ptr_bucket_data_type_mismatch, 155) \
+ x(ptr_cached_and_erasure_coded, 156) \
+ x(ptr_crc_uncompressed_size_too_small, 157) \
+ x(ptr_crc_csum_type_unknown, 158) \
+ x(ptr_crc_compression_type_unknown, 159) \
+ x(ptr_crc_redundant, 160) \
+ x(ptr_crc_uncompressed_size_too_big, 161) \
+ x(ptr_crc_nonce_mismatch, 162) \
+ x(ptr_stripe_redundant, 163) \
+ x(reservation_key_nr_replicas_invalid, 164) \
+ x(reflink_v_refcount_wrong, 165) \
+ x(reflink_p_to_missing_reflink_v, 166) \
+ x(stripe_pos_bad, 167) \
+ x(stripe_val_size_bad, 168) \
+ x(stripe_sector_count_wrong, 169) \
+ x(snapshot_tree_pos_bad, 170) \
+ x(snapshot_tree_to_missing_snapshot, 171) \
+ x(snapshot_tree_to_missing_subvol, 172) \
+ x(snapshot_tree_to_wrong_subvol, 173) \
+ x(snapshot_tree_to_snapshot_subvol, 174) \
+ x(snapshot_pos_bad, 175) \
+ x(snapshot_parent_bad, 176) \
+ x(snapshot_children_not_normalized, 177) \
+ x(snapshot_child_duplicate, 178) \
+ x(snapshot_child_bad, 179) \
+ x(snapshot_skiplist_not_normalized, 180) \
+ x(snapshot_skiplist_bad, 181) \
+ x(snapshot_should_not_have_subvol, 182) \
+ x(snapshot_to_bad_snapshot_tree, 183) \
+ x(snapshot_bad_depth, 184) \
+ x(snapshot_bad_skiplist, 185) \
+ x(subvol_pos_bad, 186) \
+ x(subvol_not_master_and_not_snapshot, 187) \
+ x(subvol_to_missing_root, 188) \
+ x(subvol_root_wrong_bi_subvol, 189) \
+ x(bkey_in_missing_snapshot, 190) \
+ x(inode_pos_inode_nonzero, 191) \
+ x(inode_pos_blockdev_range, 192) \
+ x(inode_unpack_error, 193) \
+ x(inode_str_hash_invalid, 194) \
+ x(inode_v3_fields_start_bad, 195) \
+ x(inode_snapshot_mismatch, 196) \
+ x(inode_unlinked_but_clean, 197) \
+ x(inode_unlinked_but_nlink_nonzero, 198) \
+ x(inode_checksum_type_invalid, 199) \
+ x(inode_compression_type_invalid, 200) \
+ x(inode_subvol_root_but_not_dir, 201) \
+ x(inode_i_size_dirty_but_clean, 202) \
+ x(inode_i_sectors_dirty_but_clean, 203) \
+ x(inode_i_sectors_wrong, 204) \
+ x(inode_dir_wrong_nlink, 205) \
+ x(inode_dir_multiple_links, 206) \
+ x(inode_multiple_links_but_nlink_0, 207) \
+ x(inode_wrong_backpointer, 208) \
+ x(inode_wrong_nlink, 209) \
+ x(inode_unreachable, 210) \
+ x(deleted_inode_but_clean, 211) \
+ x(deleted_inode_missing, 212) \
+ x(deleted_inode_is_dir, 213) \
+ x(deleted_inode_not_unlinked, 214) \
+ x(extent_overlapping, 215) \
+ x(extent_in_missing_inode, 216) \
+ x(extent_in_non_reg_inode, 217) \
+ x(extent_past_end_of_inode, 218) \
+ x(dirent_empty_name, 219) \
+ x(dirent_val_too_big, 220) \
+ x(dirent_name_too_long, 221) \
+ x(dirent_name_embedded_nul, 222) \
+ x(dirent_name_dot_or_dotdot, 223) \
+ x(dirent_name_has_slash, 224) \
+ x(dirent_d_type_wrong, 225) \
+ x(dirent_d_parent_subvol_wrong, 226) \
+ x(dirent_in_missing_dir_inode, 227) \
+ x(dirent_in_non_dir_inode, 228) \
+ x(dirent_to_missing_inode, 229) \
+ x(dirent_to_missing_subvol, 230) \
+ x(dirent_to_itself, 231) \
+ x(quota_type_invalid, 232) \
+ x(xattr_val_size_too_small, 233) \
+ x(xattr_val_size_too_big, 234) \
+ x(xattr_invalid_type, 235) \
+ x(xattr_name_invalid_chars, 236) \
+ x(xattr_in_missing_inode, 237) \
+ x(root_subvol_missing, 238) \
+ x(root_dir_missing, 239) \
+ x(root_inode_not_dir, 240) \
+ x(dir_loop, 241) \
+ x(hash_table_key_duplicate, 242) \
+ x(hash_table_key_wrong_offset, 243)
+
+enum bch_sb_error_id {
+#define x(t, n) BCH_FSCK_ERR_##t = n,
+ BCH_SB_ERRS()
+#undef x
+ BCH_SB_ERR_MAX
+};
+
+struct bch_sb_error_entry_cpu {
+ u64 id:16,
+ nr:48;
+ u64 last_error_time;
+};
+
+typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
+
+#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
+
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
new file mode 100644
index 000000000000..bed0f857fe5b
--- /dev/null
+++ b/fs/bcachefs/sb-members.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "opts.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+#define x(t, n, ...) [n] = #t,
+static const char * const bch2_iops_measurements[] = {
+ BCH_IOPS_MEASUREMENTS()
+ NULL
+};
+
+char * const bch2_member_error_strs[] = {
+ BCH_MEMBER_ERROR_TYPES()
+ NULL
+};
+#undef x
+
+/* Code for bch_sb_field_members_v1: */
+
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
+{
+ return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
+}
+
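+/*
+ * member_bytes may be smaller than the current struct bch_member if the
+ * superblock was written by an older version; copy what's present and leave
+ * the rest zeroed:
+ */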
+static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
+{
+ struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
+ memset(&ret, 0, sizeof(ret));
+ memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
+ return ret;
+}
+
+static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
+{
+ return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
+}
+
+static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
+{
+ struct bch_member ret, *p = members_v1_get_mut(mi, i);
+ memset(&ret, 0, sizeof(ret));
+ memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
+ return ret;
+}
+
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
+{
+ struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2);
+ if (mi2)
+ return members_v2_get(mi2, i);
+ struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1);
+ return members_v1_get(mi1, i);
+}
+
+static int sb_members_v2_resize_entries(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+
+ if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
+ unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
+ c->disk_sb.sb->nr_devices), 8);
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members_v2;
+
+ for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
+ void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
+ memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
+ memset(dst + le16_to_cpu(mi->member_bytes),
+ 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
+ }
+ mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
+ }
+ return 0;
+}
+
+int bch2_sb_members_v2_init(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v1 *mi1;
+ struct bch_sb_field_members_v2 *mi2;
+
+ if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
+ mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2,
+ DIV_ROUND_UP(sizeof(*mi2) +
+ sizeof(struct bch_member) * c->sb.nr_devices,
+ sizeof(u64)));
+ mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
+ memcpy(&mi2->_members[0], &mi1->_members[0],
+ BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
+ memset(&mi2->pad[0], 0, sizeof(mi2->pad));
+ mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
+ }
+
+ return sb_members_v2_resize_entries(c);
+}
+
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
+{
+ struct bch_sb_field_members_v1 *mi1;
+ struct bch_sb_field_members_v2 *mi2;
+
+ mi1 = bch2_sb_field_resize(disk_sb, members_v1,
+ DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
+ disk_sb->sb->nr_devices, sizeof(u64)));
+ if (!mi1)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
+
+ for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
+ memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
+
+ return 0;
+}
+
+static int validate_member(struct printbuf *err,
+ struct bch_member m,
+ struct bch_sb *sb,
+ int i)
+{
+ if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
+ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
+ i, le64_to_cpu(m.nbuckets), LONG_MAX);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le64_to_cpu(m.nbuckets) -
+ le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) {
+ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le16_to_cpu(m.bucket_size) <
+ le16_to_cpu(sb->block_size)) {
+ prt_printf(err, "device %u: bucket size %u smaller than block size %u",
+ i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le16_to_cpu(m.bucket_size) <
+ BCH_SB_BTREE_NODE_SIZE(sb)) {
+ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
+ i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ return 0;
+}
+
+static void member_to_text(struct printbuf *out,
+ struct bch_member m,
+ struct bch_sb_field_disk_groups *gi,
+ struct bch_sb *sb,
+ int i)
+{
+ unsigned data_have = bch2_sb_dev_has_data(sb, i);
+ u64 bucket_size = le16_to_cpu(m.bucket_size);
+ u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
+
+ if (!bch2_member_exists(&m))
+ return;
+
+ prt_printf(out, "Device:");
+ prt_tab(out);
+ prt_printf(out, "%u", i);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "Label:");
+ prt_tab(out);
+ if (BCH_MEMBER_GROUP(&m)) {
+ unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
+
+ if (idx < disk_groups_nr(gi))
+ prt_printf(out, "%s (%u)",
+ gi->entries[idx].label, idx);
+ else
+ prt_printf(out, "(bad disk labels section)");
+ } else {
+ prt_printf(out, "(none)");
+ }
+ prt_newline(out);
+
+ prt_printf(out, "UUID:");
+ prt_tab(out);
+ pr_uuid(out, m.uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Size:");
+ prt_tab(out);
+ prt_units_u64(out, device_size << 9);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, le64_to_cpu(m.errors[i]));
+ prt_newline(out);
+ }
+
+ for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
+ prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
+ prt_tab(out);
+ prt_printf(out, "%u", le32_to_cpu(m.iops[i]));
+ prt_newline(out);
+ }
+
+ prt_printf(out, "Bucket size:");
+ prt_tab(out);
+ prt_units_u64(out, bucket_size << 9);
+ prt_newline(out);
+
+ prt_printf(out, "First bucket:");
+ prt_tab(out);
+ prt_printf(out, "%u", le16_to_cpu(m.first_bucket));
+ prt_newline(out);
+
+ prt_printf(out, "Buckets:");
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(m.nbuckets));
+ prt_newline(out);
+
+ prt_printf(out, "Last mount:");
+ prt_tab(out);
+ if (m.last_mount)
+ bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
+ else
+ prt_printf(out, "(never)");
+ prt_newline(out);
+
+ prt_printf(out, "State:");
+ prt_tab(out);
+ prt_printf(out, "%s",
+ BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
+ ? bch2_member_states[BCH_MEMBER_STATE(&m)]
+ : "unknown");
+ prt_newline(out);
+
+ prt_printf(out, "Data allowed:");
+ prt_tab(out);
+ if (BCH_MEMBER_DATA_ALLOWED(&m))
+ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Has data:");
+ prt_tab(out);
+ if (data_have)
+ prt_bitflags(out, bch2_data_types, data_have);
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Discard:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
+ prt_newline(out);
+
+ prt_printf(out, "Freespace initialized:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static int bch2_sb_members_v1_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+ unsigned i;
+
+ if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
+ prt_printf(err, "too many devices for section size");
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member m = members_v1_get(mi, i);
+
+ int ret = validate_member(err, m, sb, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+ struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
+ unsigned i;
+
+ for (i = 0; i < sb->nr_devices; i++)
+ member_to_text(out, members_v1_get(mi, i), gi, sb, i);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
+ .validate = bch2_sb_members_v1_validate,
+ .to_text = bch2_sb_members_v1_to_text,
+};
+
+static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+ struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
+ unsigned i;
+
+ for (i = 0; i < sb->nr_devices; i++)
+ member_to_text(out, members_v2_get(mi, i), gi, sb, i);
+}
+
+static int bch2_sb_members_v2_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+ size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
+ (void *) mi;
+
+ if (mi_bytes > vstruct_bytes(&mi->field)) {
+ prt_printf(err, "section too small (%zu > %zu)",
+ mi_bytes, vstruct_bytes(&mi->field));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ for (unsigned i = 0; i < sb->nr_devices; i++) {
+ int ret = validate_member(err, members_v2_get(mi, i), sb, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
+ .validate = bch2_sb_members_v2_validate,
+ .to_text = bch2_sb_members_v2_to_text,
+};
+
+void bch2_sb_members_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ struct bch_dev *ca;
+ unsigned i, e;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, i);
+
+ for (e = 0; e < BCH_MEMBER_ERROR_NR; e++)
+ m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
+ }
+ rcu_read_unlock();
+}
+
+void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_member m;
+
+ mutex_lock(&ca->fs->sb_lock);
+ m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&ca->fs->sb_lock);
+
+ printbuf_tabstop_push(out, 12);
+
+ prt_str(out, "IO errors since filesystem creation");
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, atomic64_read(&ca->errors[i]));
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+
+ prt_str(out, "IO errors since ");
+ bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
+ prt_str(out, " ago");
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_dev_errors_reset(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_member *m;
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
+ m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
+ m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
new file mode 100644
index 000000000000..03613e3eb8e3
--- /dev/null
+++ b/fs/bcachefs/sb-members.h
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_H
+#define _BCACHEFS_SB_MEMBERS_H
+
+extern char * const bch2_member_error_strs[];
+
+static inline struct bch_member *
+__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
+{
+ return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
+}
+
+int bch2_sb_members_v2_init(struct bch_fs *c);
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
+
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
+{
+ return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+ return bch2_dev_is_online(ca) &&
+ ca->mi.state != BCH_MEMBER_STATE_failed;
+}
+
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+ if (!percpu_ref_tryget(&ca->io_ref))
+ return false;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
+ return true;
+
+ percpu_ref_put(&ca->io_ref);
+ return false;
+}
+
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
+{
+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
+}
+
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+ unsigned dev)
+{
+ unsigned i;
+
+ for (i = 0; i < devs.nr; i++)
+ if (devs.devs[i] == dev)
+ return true;
+
+ return false;
+}
+
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ unsigned i;
+
+ for (i = 0; i < devs->nr; i++)
+ if (devs->devs[i] == dev) {
+ array_remove_item(devs->devs, devs->nr, i);
+ return;
+ }
+}
+
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ if (!bch2_dev_list_has_dev(*devs, dev)) {
+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
+ devs->devs[devs->nr++] = dev;
+ }
+}
+
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
+ const struct bch_devs_mask *mask)
+{
+ struct bch_dev *ca = NULL;
+
+ while ((*iter = mask
+ ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
+ : *iter) < c->sb.nr_devices &&
+ !(ca = rcu_dereference_check(c->devs[*iter],
+ lockdep_is_held(&c->state_lock))))
+ (*iter)++;
+
+ return ca;
+}
+
+#define for_each_member_device_rcu(ca, c, iter, mask) \
+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+{
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ if ((ca = __bch2_next_dev(c, iter, NULL)))
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ return ca;
+}
+
+/*
+ * If you break early, you must drop your ref on the current device
+ */
+#define for_each_member_device(ca, c, iter) \
+ for ((iter) = 0; \
+ (ca = bch2_get_next_dev(c, &(iter))); \
+ percpu_ref_put(&ca->ref), (iter)++)
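+
+/*
+ * Illustrative sketch of the rule above (found() is a hypothetical predicate,
+ * not part of this API): when breaking out of the loop early, the iterator
+ * cannot drop the ref for us, so drop it explicitly:
+ *
+ * for_each_member_device(ca, c, i) {
+ *         if (found(ca)) {
+ *                 percpu_ref_put(&ca->ref);
+ *                 break;
+ *         }
+ * }
+ */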
+
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
+ unsigned *iter,
+ int state_mask)
+{
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+ (!((1 << ca->mi.state) & state_mask) ||
+ !percpu_ref_tryget(&ca->io_ref)))
+ (*iter)++;
+ rcu_read_unlock();
+
+ return ca;
+}
+
+#define __for_each_online_member(ca, c, iter, state_mask) \
+ for ((iter) = 0; \
+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
+ percpu_ref_put(&ca->io_ref), (iter)++)
+
+#define for_each_online_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, ~0)
+
+#define for_each_rw_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
+
+#define for_each_readable_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, \
+ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
+
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_protected(c->devs[idx],
+ lockdep_is_held(&c->sb_lock) ||
+ lockdep_is_held(&c->state_lock));
+}
+
+/* XXX kill, move to struct bch_fs */
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+ struct bch_devs_mask devs;
+ struct bch_dev *ca;
+ unsigned i;
+
+ memset(&devs, 0, sizeof(devs));
+ for_each_online_member(ca, c, i)
+ __set_bit(ca->dev_idx, devs.d);
+ return devs;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
+
+static inline bool bch2_member_exists(struct bch_member *m)
+{
+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
+}
+
+static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
+{
+ if (dev < sb->nr_devices) {
+ struct bch_member m = bch2_sb_member_get(sb, dev);
+ return bch2_member_exists(&m);
+ }
+ return false;
+}
+
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
+{
+ return (struct bch_member_cpu) {
+ .nbuckets = le64_to_cpu(mi->nbuckets),
+ .first_bucket = le16_to_cpu(mi->first_bucket),
+ .bucket_size = le16_to_cpu(mi->bucket_size),
+ .group = BCH_MEMBER_GROUP(mi),
+ .state = BCH_MEMBER_STATE(mi),
+ .discard = BCH_MEMBER_DISCARD(mi),
+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
+ .durability = BCH_MEMBER_DURABILITY(mi)
+ ? BCH_MEMBER_DURABILITY(mi) - 1
+ : 1,
+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
+ .valid = bch2_member_exists(mi),
+ };
+}
+
+void bch2_sb_members_from_cpu(struct bch_fs *);
+
+void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
+void bch2_dev_errors_reset(struct bch_dev *);
+
+#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
new file mode 100644
index 000000000000..c1860d8163fb
--- /dev/null
+++ b/fs/bcachefs/seqmutex.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SEQMUTEX_H
+#define _BCACHEFS_SEQMUTEX_H
+
+#include <linux/mutex.h>
+
+struct seqmutex {
+ struct mutex lock;
+ u32 seq;
+};
+
+#define seqmutex_init(_lock) mutex_init(&(_lock)->lock)
+
+static inline bool seqmutex_trylock(struct seqmutex *lock)
+{
+ return mutex_trylock(&lock->lock);
+}
+
+static inline void seqmutex_lock(struct seqmutex *lock)
+{
+ mutex_lock(&lock->lock);
+}
+
+static inline void seqmutex_unlock(struct seqmutex *lock)
+{
+ lock->seq++;
+ mutex_unlock(&lock->lock);
+}
+
+static inline u32 seqmutex_seq(struct seqmutex *lock)
+{
+ return lock->seq;
+}
+
+static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
+{
+ if (lock->seq != seq || !mutex_trylock(&lock->lock))
+ return false;
+
+ if (lock->seq != seq) {
+ mutex_unlock(&lock->lock);
+ return false;
+ }
+
+ return true;
+}
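+
+/*
+ * Usage sketch (hypothetical; 'c' and the retry label are illustrative, not
+ * part of this header). Note that seqmutex_unlock() bumps the sequence, so
+ * the value passed to seqmutex_relock() must be read after unlocking:
+ *
+ * seqmutex_lock(&c->lock);
+ * ...walk protected state...
+ * seqmutex_unlock(&c->lock);
+ * u32 seq = seqmutex_seq(&c->lock);
+ *
+ * ...block, allocate, etc...
+ *
+ * if (!seqmutex_relock(&c->lock, seq))
+ *         goto retry;     /* mutex was taken by someone else; restart */
+ */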
+
+#endif /* _BCACHEFS_SEQMUTEX_H */
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
new file mode 100644
index 000000000000..dc1a27cc31cd
--- /dev/null
+++ b/fs/bcachefs/siphash.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
+
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
+ * are the number of compression rounds and the number of finalization rounds.
+ * A compression round is identical to a finalization round and this round
+ * function is called SipRound. Given a 128-bit key k and a (possibly empty)
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
+ *
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
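+
+/*
+ * Minimal usage sketch (the key bytes and buffer below are illustrative, not
+ * part of this file); SipHash24() from siphash.h wraps SipHash() with the
+ * standard 2 compression / 4 finalization rounds:
+ *
+ * SIPHASH_KEY key = {
+ *         .k0 = cpu_to_le64(0x0706050403020100ULL),
+ *         .k1 = cpu_to_le64(0x0f0e0d0c0b0a0908ULL),
+ * };
+ * u64 digest = SipHash24(&key, buf, len);
+ */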
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+
+#include "siphash.h"
+
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+ while (rounds--) {
+ ctx->v[0] += ctx->v[1];
+ ctx->v[2] += ctx->v[3];
+ ctx->v[1] = rol64(ctx->v[1], 13);
+ ctx->v[3] = rol64(ctx->v[3], 16);
+
+ ctx->v[1] ^= ctx->v[0];
+ ctx->v[3] ^= ctx->v[2];
+ ctx->v[0] = rol64(ctx->v[0], 32);
+
+ ctx->v[2] += ctx->v[1];
+ ctx->v[0] += ctx->v[3];
+ ctx->v[1] = rol64(ctx->v[1], 17);
+ ctx->v[3] = rol64(ctx->v[3], 21);
+
+ ctx->v[1] ^= ctx->v[2];
+ ctx->v[3] ^= ctx->v[0];
+ ctx->v[2] = rol64(ctx->v[2], 32);
+ }
+}
+
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
+{
+ u64 m = get_unaligned_le64(ptr);
+
+ ctx->v[3] ^= m;
+ SipHash_Rounds(ctx, rounds);
+ ctx->v[0] ^= m;
+}
+
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+ u64 k0, k1;
+
+ k0 = le64_to_cpu(key->k0);
+ k1 = le64_to_cpu(key->k1);
+
+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+ ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+ memset(ctx->buf, 0, sizeof(ctx->buf));
+ ctx->bytes = 0;
+}
+
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
+ const void *src, size_t len)
+{
+ const u8 *ptr = src;
+ size_t left, used;
+
+ if (len == 0)
+ return;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ ctx->bytes += len;
+
+ if (used > 0) {
+ left = sizeof(ctx->buf) - used;
+
+ if (len >= left) {
+ memcpy(&ctx->buf[used], ptr, left);
+ SipHash_CRounds(ctx, ctx->buf, rc);
+ len -= left;
+ ptr += left;
+ used = 0; /* buffer drained: any trailing partial bytes start at offset 0 */
+ } else {
+ memcpy(&ctx->buf[used], ptr, len);
+ return;
+ }
+ }
+
+ while (len >= sizeof(ctx->buf)) {
+ SipHash_CRounds(ctx, ptr, rc);
+ len -= sizeof(ctx->buf);
+ ptr += sizeof(ctx->buf);
+ }
+
+ if (len > 0)
+ memcpy(&ctx->buf[used], ptr, len);
+}
+
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+
+ r = SipHash_End(ctx, rc, rf);
+
+ *((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+ size_t left, used;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ left = sizeof(ctx->buf) - used;
+ memset(&ctx->buf[used], 0, left - 1);
+ ctx->buf[7] = ctx->bytes;
+
+ SipHash_CRounds(ctx, ctx->buf, rc);
+ ctx->v[2] ^= 0xff;
+ SipHash_Rounds(ctx, rf);
+
+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+ memset(ctx, 0, sizeof(*ctx));
+ return r;
+}
+
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+ SIPHASH_CTX ctx;
+
+ SipHash_Init(&ctx, key);
+ SipHash_Update(&ctx, rc, rf, src, len);
+ return SipHash_End(&ctx, rc, rf);
+}
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
new file mode 100644
index 000000000000..3dfaf34a43b2
--- /dev/null
+++ b/fs/bcachefs/siphash.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is defined during the initialization:
+ * SipHash24_Init() for the fast and reasonably strong version
+ * SipHash48_Init() for the strong version (half as fast)
+ *
+ * struct SIPHASH_CTX ctx;
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH 8
+#define SIPHASH_KEY_LENGTH 16
+#define SIPHASH_DIGEST_LENGTH 8
+
+typedef struct _SIPHASH_CTX {
+ u64 v[4];
+ u8 buf[SIPHASH_BLOCK_LENGTH];
+ u32 bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+ __le64 k0;
+ __le64 k1;
+} SIPHASH_KEY;
+
+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64 SipHash_End(SIPHASH_CTX *, int, int);
+void SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
new file mode 100644
index 000000000000..97790445e67a
--- /dev/null
+++ b/fs/bcachefs/six.c
@@ -0,0 +1,920 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+
+#include <trace/events/lock.h>
+
+#include "six.h"
+
+#ifdef DEBUG
+#define EBUG_ON(cond) BUG_ON(cond)
+#else
+#define EBUG_ON(cond) do {} while (0)
+#endif
+
+#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
+#define six_release(l, ip) lock_release(l, ip)
+
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+
+#define SIX_LOCK_HELD_read_OFFSET 0
+#define SIX_LOCK_HELD_read ~(~0U << 26)
+#define SIX_LOCK_HELD_intent (1U << 26)
+#define SIX_LOCK_HELD_write (1U << 27)
+#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read))
+#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write))
+#define SIX_LOCK_NOSPIN (1U << 31)
+
+struct six_lock_vals {
+ /* Value we add to the lock in order to take the lock: */
+ u32 lock_val;
+
+ /* If the lock has this value (used as a mask), taking the lock fails: */
+ u32 lock_fail;
+
+ /* Mask that indicates lock is held for this type: */
+ u32 held_mask;
+
+ /* Waitlist we wakeup when releasing the lock: */
+ enum six_lock_type unlock_wakeup;
+};
+
+static const struct six_lock_vals l[] = {
+ [SIX_LOCK_read] = {
+ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET,
+ .lock_fail = SIX_LOCK_HELD_write,
+ .held_mask = SIX_LOCK_HELD_read,
+ .unlock_wakeup = SIX_LOCK_write,
+ },
+ [SIX_LOCK_intent] = {
+ .lock_val = SIX_LOCK_HELD_intent,
+ .lock_fail = SIX_LOCK_HELD_intent,
+ .held_mask = SIX_LOCK_HELD_intent,
+ .unlock_wakeup = SIX_LOCK_intent,
+ },
+ [SIX_LOCK_write] = {
+ .lock_val = SIX_LOCK_HELD_write,
+ .lock_fail = SIX_LOCK_HELD_read,
+ .held_mask = SIX_LOCK_HELD_write,
+ .unlock_wakeup = SIX_LOCK_read,
+ },
+};
+
+static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
+{
+ if ((atomic_read(&lock->state) & mask) != mask)
+ atomic_or(mask, &lock->state);
+}
+
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
+{
+ if (atomic_read(&lock->state) & mask)
+ atomic_and(~mask, &lock->state);
+}
+
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
+ u32 old, struct task_struct *owner)
+{
+ if (type != SIX_LOCK_intent)
+ return;
+
+ if (!(old & SIX_LOCK_HELD_intent)) {
+ EBUG_ON(lock->owner);
+ lock->owner = owner;
+ } else {
+ EBUG_ON(lock->owner != current);
+ }
+}
+
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+ unsigned read_count = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ read_count += *per_cpu_ptr(lock->readers, cpu);
+ return read_count;
+}
+
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+ struct task_struct *task, bool try)
+{
+ int ret;
+ u32 old;
+
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
+ EBUG_ON(type == SIX_LOCK_write &&
+ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
+
+ /*
+ * Percpu reader mode:
+ *
+ * The basic idea behind this algorithm is that you can implement a lock
+ * between two threads without any atomics, just memory barriers:
+ *
+ * For two threads you'll need two variables, one variable for "thread a
+ * has the lock" and another for "thread b has the lock".
+ *
+ * To take the lock, a thread sets its variable indicating that it holds
+ * the lock, then issues a full memory barrier, then reads from the
+ * other thread's variable to check if the other thread thinks it has
+ * the lock. If we raced, we backoff and retry/sleep.
+ *
+ * Failure to take the lock may cause a spurious trylock failure in
+ * another thread, because we temporarily set the lock to indicate that
+ * we held it. This would be a problem for a thread in six_lock(), when
+ * it calls trylock after adding itself to the waitlist and
+ * prior to sleeping.
+ *
+ * Therefore, if we fail to get the lock, and there were waiters of the
+ * type we conflict with, we will have to issue a wakeup.
+ *
+ * Since we may be called under wait_lock (and by the wakeup code
+ * itself), we return that the wakeup has to be done instead of doing it
+ * here.
+ */
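+ /*
+ * A compressed sketch of the protocol described above (pseudo-code, not
+ * part of this function):
+ *
+ * thread A: a_held = true; smp_mb(); if (b_held) { a_held = false; retry; }
+ * thread B: b_held = true; smp_mb(); if (a_held) { b_held = false; retry; }
+ */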
+ if (type == SIX_LOCK_read && lock->readers) {
+ preempt_disable();
+ this_cpu_inc(*lock->readers); /* signal that we own lock */
+
+ smp_mb();
+
+ old = atomic_read(&lock->state);
+ ret = !(old & l[type].lock_fail);
+
+ this_cpu_sub(*lock->readers, !ret);
+ preempt_enable();
+
+ if (!ret) {
+ smp_mb();
+ if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
+ ret = -1 - SIX_LOCK_write;
+ }
+ } else if (type == SIX_LOCK_write && lock->readers) {
+ if (try) {
+ atomic_add(SIX_LOCK_HELD_write, &lock->state);
+ smp_mb__after_atomic();
+ }
+
+ ret = !pcpu_read_count(lock);
+
+ if (try && !ret) {
+ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
+ if (old & SIX_LOCK_WAITING_read)
+ ret = -1 - SIX_LOCK_read;
+ }
+ } else {
+ old = atomic_read(&lock->state);
+ do {
+ ret = !(old & l[type].lock_fail);
+ if (!ret || (type == SIX_LOCK_write && !try)) {
+ smp_mb();
+ break;
+ }
+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
+
+ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
+ }
+
+ if (ret > 0)
+ six_set_owner(lock, type, old, task);
+
+ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
+ (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
+
+ return ret;
+}
+
+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+{
+ struct six_lock_waiter *w, *next;
+ struct task_struct *task;
+ bool saw_one;
+ int ret;
+again:
+ ret = 0;
+ saw_one = false;
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+ if (w->lock_want != lock_type)
+ continue;
+
+ if (saw_one && lock_type != SIX_LOCK_read)
+ goto unlock;
+ saw_one = true;
+
+ ret = __do_six_trylock(lock, lock_type, w->task, false);
+ if (ret <= 0)
+ goto unlock;
+
+ /*
+ * Similar to percpu_rwsem_wake_function(), we need to guard
+ * against the wakee noticing w->lock_acquired, returning, and
+ * then exiting before we do the wakeup:
+ */
+ task = get_task_struct(w->task);
+ __list_del(w->list.prev, w->list.next);
+ /*
+ * The release barrier here ensures the ordering of the
+ * __list_del before setting w->lock_acquired; @w is on the
+ * stack of the thread doing the waiting and will be reused
+ * after it sees w->lock_acquired with no other locking:
+ * pairs with smp_load_acquire() in six_lock_slowpath()
+ */
+ smp_store_release(&w->lock_acquired, true);
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+
+ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
+unlock:
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (ret < 0) {
+ lock_type = -ret - 1;
+ goto again;
+ }
+}
+
+__always_inline
+static void six_lock_wakeup(struct six_lock *lock, u32 state,
+ enum six_lock_type lock_type)
+{
+ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
+ return;
+
+ if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
+ return;
+
+ __six_lock_wakeup(lock, lock_type);
+}
+
+__always_inline
+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
+{
+ int ret;
+
+ ret = __do_six_trylock(lock, type, current, try);
+ if (ret < 0)
+ __six_lock_wakeup(lock, -ret - 1);
+
+ return ret > 0;
+}
+
+/**
+ * six_trylock_ip - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+ if (!do_six_trylock(lock, type, true))
+ return false;
+
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_trylock_ip);
+
+/**
+ * six_relock_ip - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq, unsigned long ip)
+{
+ if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
+ return false;
+
+ if (six_lock_seq(lock) != seq) {
+ six_unlock_ip(lock, type, ip);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_relock_ip);
+
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+
+static inline bool six_can_spin_on_owner(struct six_lock *lock)
+{
+ struct task_struct *owner;
+ bool ret;
+
+ if (need_resched())
+ return false;
+
+ rcu_read_lock();
+ owner = READ_ONCE(lock->owner);
+ ret = !owner || owner_on_cpu(owner);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool six_spin_on_owner(struct six_lock *lock,
+ struct task_struct *owner,
+ u64 end_time)
+{
+ bool ret = true;
+ unsigned loop = 0;
+
+ rcu_read_lock();
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner_on_cpu(owner) || need_resched()) {
+ ret = false;
+ break;
+ }
+
+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+ six_set_bitmask(lock, SIX_LOCK_NOSPIN);
+ ret = false;
+ break;
+ }
+
+ cpu_relax();
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ struct task_struct *task = current;
+ u64 end_time;
+
+ if (type == SIX_LOCK_write)
+ return false;
+
+ preempt_disable();
+ if (!six_can_spin_on_owner(lock))
+ goto fail;
+
+ if (!osq_lock(&lock->osq))
+ goto fail;
+
+ end_time = sched_clock() + 10 * NSEC_PER_USEC;
+
+ while (1) {
+ struct task_struct *owner;
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = READ_ONCE(lock->owner);
+ if (owner && !six_spin_on_owner(lock, owner, end_time))
+ break;
+
+ if (do_six_trylock(lock, type, false)) {
+ osq_unlock(&lock->osq);
+ preempt_enable();
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task, that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+
+ osq_unlock(&lock->osq);
+fail:
+ preempt_enable();
+
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock again. This avoids getting
+ * scheduled out right after we obtained the lock.
+ */
+ if (need_resched())
+ schedule();
+
+ return false;
+}
+
+#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ return false;
+}
+
+#endif
+
+noinline
+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ int ret = 0;
+
+ if (type == SIX_LOCK_write) {
+ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+ atomic_add(SIX_LOCK_HELD_write, &lock->state);
+ smp_mb__after_atomic();
+ }
+
+ trace_contention_begin(lock, 0);
+ lock_contended(&lock->dep_map, ip);
+
+ if (six_optimistic_spin(lock, type))
+ goto out;
+
+ wait->task = current;
+ wait->lock_want = type;
+ wait->lock_acquired = false;
+
+ raw_spin_lock(&lock->wait_lock);
+ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
+ /*
+ * Retry taking the lock after taking waitlist lock, in case we raced
+ * with an unlock:
+ */
+ ret = __do_six_trylock(lock, type, current, false);
+ if (ret <= 0) {
+ wait->start_time = local_clock();
+
+ if (!list_empty(&lock->wait_list)) {
+ struct six_lock_waiter *last =
+ list_last_entry(&lock->wait_list,
+ struct six_lock_waiter, list);
+
+ if (time_before_eq64(wait->start_time, last->start_time))
+ wait->start_time = last->start_time + 1;
+ }
+
+ list_add_tail(&wait->list, &lock->wait_list);
+ }
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (unlikely(ret > 0)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (unlikely(ret < 0)) {
+ __six_lock_wakeup(lock, -ret - 1);
+ ret = 0;
+ }
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /*
+ * Ensures that writes to the waitlist entry happen after we see
+ * wait->lock_acquired: pairs with the smp_store_release in
+ * __six_lock_wakeup
+ */
+ if (smp_load_acquire(&wait->lock_acquired))
+ break;
+
+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+ if (unlikely(ret)) {
+ bool acquired;
+
+ /*
+ * If should_sleep_fn() returns an error, we are
+ * required to return that error even if we already
+ * acquired the lock - should_sleep_fn() might have
+ * modified external state (e.g. when the deadlock cycle
+ * detector in bcachefs issued a transaction restart)
+ */
+ raw_spin_lock(&lock->wait_lock);
+ acquired = wait->lock_acquired;
+ if (!acquired)
+ list_del(&wait->list);
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (unlikely(acquired))
+ do_six_unlock_type(lock, type);
+ break;
+ }
+
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+out:
+ if (ret && type == SIX_LOCK_write) {
+ six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
+ }
+ trace_contention_end(lock, 0);
+
+ return ret;
+}
+
+/**
+ * six_lock_ip_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * This is the most general six_lock() variant, with parameters to support full
+ * cycle detection for deadlock avoidance.
+ *
+ * The code calling this function must implement tracking of held locks, and the
+ * @wait object should be embedded into the struct that tracks held locks -
+ * which must also be accessible in a thread-safe way.
+ *
+ * @should_sleep_fn should invoke the cycle detector; it should walk each
+ * lock's waiters, and for each waiter recursively walk their held locks.
+ *
+ * When this function must block, @wait will be added to @lock's waitlist before
+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be
+ * removed from the lock waitlist until the lock has been successfully acquired,
+ * or we abort.
+ *
+ * @wait.start_time will be monotonically increasing for any given waitlist, and
+ * thus may be used as a loop cursor.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ int ret;
+
+ wait->start_time = 0;
+
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
+
+ ret = do_six_trylock(lock, type, true) ? 0
+ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
+
+ if (ret && type != SIX_LOCK_write)
+ six_release(&lock->dep_map, ip);
+ if (!ret)
+ lock_acquired(&lock->dep_map, ip);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
+
+__always_inline
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ u32 state;
+
+ if (type == SIX_LOCK_intent)
+ lock->owner = NULL;
+
+ if (type == SIX_LOCK_read &&
+ lock->readers) {
+ smp_mb(); /* unlock barrier */
+ this_cpu_dec(*lock->readers);
+ smp_mb(); /* between unlocking and checking for waiters */
+ state = atomic_read(&lock->state);
+ } else {
+ u32 v = l[type].lock_val;
+
+ if (type != SIX_LOCK_read)
+ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
+
+ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+ state = atomic_sub_return_release(v, &lock->state);
+ }
+
+ six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+}
+
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock); read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+ EBUG_ON((type == SIX_LOCK_write ||
+ type == SIX_LOCK_intent) &&
+ lock->owner != current);
+
+ if (type != SIX_LOCK_write)
+ six_release(&lock->dep_map, ip);
+ else
+ lock->seq++;
+
+ if (type == SIX_LOCK_intent &&
+ lock->intent_lock_recurse) {
+ --lock->intent_lock_recurse;
+ return;
+ }
+
+ do_six_unlock_type(lock, type);
+}
+EXPORT_SYMBOL_GPL(six_unlock_ip);
+
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock: lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
+void six_lock_downgrade(struct six_lock *lock)
+{
+ six_lock_increment(lock, SIX_LOCK_read);
+ six_unlock_intent(lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
+
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock: lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_lock_tryupgrade(struct six_lock *lock)
+{
+ u32 old = atomic_read(&lock->state), new;
+
+ do {
+ new = old;
+
+ if (new & SIX_LOCK_HELD_intent)
+ return false;
+
+ if (!lock->readers) {
+ EBUG_ON(!(new & SIX_LOCK_HELD_read));
+ new -= l[SIX_LOCK_read].lock_val;
+ }
+
+ new |= SIX_LOCK_HELD_intent;
+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
+
+ if (lock->readers)
+ this_cpu_dec(*lock->readers);
+
+ six_set_owner(lock, SIX_LOCK_intent, old, current);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
+
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock: lock to upgrade
+ * @from: SIX_LOCK_read or SIX_LOCK_intent
+ * @to: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_trylock_convert(struct six_lock *lock,
+ enum six_lock_type from,
+ enum six_lock_type to)
+{
+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
+
+ if (to == from)
+ return true;
+
+ if (to == SIX_LOCK_read) {
+ six_lock_downgrade(lock);
+ return true;
+ } else {
+ return six_lock_tryupgrade(lock);
+ }
+}
+EXPORT_SYMBOL_GPL(six_trylock_convert);
+
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock: lock to increment
+ * @type: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
+ */
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
+{
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
+
+ /* XXX: assert already locked, and that we don't overflow: */
+
+ switch (type) {
+ case SIX_LOCK_read:
+ if (lock->readers) {
+ this_cpu_inc(*lock->readers);
+ } else {
+ EBUG_ON(!(atomic_read(&lock->state) &
+ (SIX_LOCK_HELD_read|
+ SIX_LOCK_HELD_intent)));
+ atomic_add(l[type].lock_val, &lock->state);
+ }
+ break;
+ case SIX_LOCK_intent:
+ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+ lock->intent_lock_recurse++;
+ break;
+ case SIX_LOCK_write:
+ BUG();
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(six_lock_increment);
+
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock: lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
+void six_lock_wakeup_all(struct six_lock *lock)
+{
+ u32 state = atomic_read(&lock->state);
+ struct six_lock_waiter *w;
+
+ six_lock_wakeup(lock, state, SIX_LOCK_read);
+ six_lock_wakeup(lock, state, SIX_LOCK_intent);
+ six_lock_wakeup(lock, state, SIX_LOCK_write);
+
+ raw_spin_lock(&lock->wait_lock);
+ list_for_each_entry(w, &lock->wait_list, list)
+ wake_up_process(w->task);
+ raw_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock: lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+ struct six_lock_count ret;
+
+ ret.n[SIX_LOCK_read] = !lock->readers
+ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
+ : pcpu_read_count(lock);
+ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+ lock->intent_lock_recurse;
+ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock: lock to add/subtract readers for
+ * @nr: reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread - they do no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may, however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
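+ *
+ * Hypothetical sketch of that pattern (count_my_read_locks() and obj are
+ * illustrative, not part of this API):
+ *
+ * unsigned nr = count_my_read_locks(obj);
+ * six_lock_readers_add(&obj->lock, -nr);
+ * six_lock_write(&obj->lock, NULL, NULL);
+ * ...update...
+ * six_unlock_write(&obj->lock);
+ * six_lock_readers_add(&obj->lock, nr);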
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+ if (lock->readers) {
+ this_cpu_add(*lock->readers, nr);
+ } else {
+ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+ /* reader count starts at bit 0 */
+ atomic_add(nr, &lock->state);
+ }
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock: lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+ WARN_ON(lock->readers && pcpu_read_count(lock));
+ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+ free_percpu(lock->readers);
+ lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+ struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+ atomic_set(&lock->state, 0);
+ raw_spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+ /*
+ * Don't assume that we have real percpu variables available in
+ * userspace:
+ */
+#ifdef __KERNEL__
+ if (flags & SIX_LOCK_INIT_PCPU) {
+ /*
+ * We don't return an error here on memory allocation failure
+ * since percpu is an optimization, and locks will work with the
+ * same semantics in non-percpu mode: callers can check for
+ * failure if they wish by checking lock->readers, but generally
+ * will not want to treat it as an error.
+ */
+ lock->readers = alloc_percpu(unsigned);
+ }
+#endif
+}
+EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
new file mode 100644
index 000000000000..4c268b0b8316
--- /dev/null
+++ b/fs/bcachefs/six.h
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_SIX_H
+#define _LINUX_SIX_H
+
+/**
+ * DOC: SIX locks overview
+ *
+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
+ * but with an additional state: read/shared, intent, exclusive/write
+ *
+ * The purpose of the intent state is to allow for greater concurrency on tree
+ * structures without deadlocking. In general, a read can't be upgraded to a
+ * write lock without deadlocking, so an operation that updates multiple nodes
+ * will have to take write locks for the full duration of the operation.
+ *
+ * But by adding an intent state, which is exclusive with other intent locks but
+ * not with readers, we can take intent locks at the start of the operation,
+ * and then take write locks only for the actual update to each individual
+ * node, without deadlocking.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * An intent lock must be held before taking a write lock:
+ * six_lock_intent(&foo->lock);
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ * six_trylock_read()
+ * six_trylock_intent()
+ * six_trylock_write()
+ *
+ * six_lock_downgrade() convert from intent to read
+ * six_lock_tryupgrade() attempt to convert from read to intent, may fail
+ *
+ * There are also interfaces that take the lock type as an enum:
+ *
+ * six_lock_type(&foo->lock, SIX_LOCK_read);
+ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
+ * six_lock_type(&foo->lock, SIX_LOCK_write);
+ * six_unlock_type(&foo->lock, SIX_LOCK_write);
+ * six_unlock_type(&foo->lock, SIX_LOCK_intent);
+ *
+ * Lock sequence numbers - unlock(), relock():
+ *
+ * Locks embed sequence numbers, which are incremented on write lock/unlock.
+ * This allows locks to be dropped and then retaken iff the state they protect
+ * hasn't changed; this makes it much easier to avoid holding locks while e.g.
+ * doing IO or allocating memory.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * u32 seq = six_lock_seq(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * some_operation_that_may_block();
+ *
+ * if (six_relock_read(&foo->lock, seq)) { ... }
+ *
+ * If the relock operation succeeds, it is as if the lock was never unlocked.
+ *
+ * Reentrancy:
+ *
+ * Six locks are not by themselves reentrant, but have counters for both the
+ * read and intent states that can be used to provide reentrancy by an upper
+ * layer that tracks held locks. If a lock is known to already be held in the
+ * read or intent state, six_lock_increment() can be used to bump the "lock
+ * held in this state" counter, increasing the number of unlock calls that
+ * will be required to fully unlock it.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);
+ * six_unlock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ * foo->lock is now fully unlocked.
+ *
+ * Since the intent state supersedes read, it's legal to increment the read
+ * counter when holding an intent lock, but not the reverse.
+ *
+ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
+ * is not legal.
+ *
+ * should_sleep_fn:
+ *
+ * There is a six_lock() variant that takes a function pointer that is called
+ * immediately prior to schedule() when blocking, and may return an error to
+ * abort.
+ *
+ * One possible use for this feature is when objects being locked are part of
+ * a cache and may be reused, and lock ordering is based on a property of the
+ * object that will change when the object is reused - i.e. logical key order.
+ *
+ * If looking up an object in the cache may race with object reuse, and lock
+ * ordering is required to prevent deadlock, object reuse may change the
+ * correct lock order for that object and cause a deadlock. should_sleep_fn
+ * can be used to check if the object is still the object we want and avoid
+ * this deadlock.
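+ *
+ * A hypothetical should_sleep_fn for that case (obj_still_wanted() and
+ * struct my_obj are illustrative, not part of this API):
+ *
+ * static int check_obj(struct six_lock *lock, void *p)
+ * {
+ *         struct my_obj *obj = p;
+ *
+ *         return obj_still_wanted(obj) ? 0 : -EAGAIN;
+ * }
+ *
+ * ret = six_lock_read(&obj->lock, check_obj, obj);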
+ *
+ * Wait list entry interface:
+ *
+ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
+ * wait list entry. By embedding six_lock_waiter into another object, and by
+ * traversing lock waitlists, it is then possible for an upper layer to
+ * implement full cycle detection for deadlock avoidance.
+ *
+ * should_sleep_fn should be used for invoking the cycle detector, walking the
+ * graph of held locks to check for a deadlock. The upper layer must track
+ * held locks for each thread, and each thread's held locks must be reachable
+ * from its six_lock_waiter object.
+ *
+ * six_lock_waiter() will add the wait object to the waitlist before retrying to take
+ * the lock, and before calling should_sleep_fn, and the wait object will not
+ * be removed from the waitlist until either the lock has been successfully
+ * acquired, or we aborted because should_sleep_fn returned an error.
+ *
+ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
+ * have timestamps in strictly ascending order - this is so the timestamp can
+ * be used as a cursor for lock graph traversal.
+ */
+
+#include <linux/lockdep.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+#include <linux/osq_lock.h>
+#endif
+
+enum six_lock_type {
+ SIX_LOCK_read,
+ SIX_LOCK_intent,
+ SIX_LOCK_write,
+};
+
+struct six_lock {
+ atomic_t state;
+ u32 seq;
+ unsigned intent_lock_recurse;
+ struct task_struct *owner;
+ unsigned __percpu *readers;
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+ struct optimistic_spin_queue osq;
+#endif
+ raw_spinlock_t wait_lock;
+ struct list_head wait_list;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum six_lock_type lock_want;
+ bool lock_acquired;
+ u64 start_time;
+};
+
+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
+
+void six_lock_exit(struct six_lock *lock);
+
+enum six_lock_init_flags {
+ SIX_LOCK_INIT_PCPU = 1U << 0,
+};
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+ struct lock_class_key *key, enum six_lock_init_flags flags);
+
+/**
+ * six_lock_init - initialize a six lock
+ * @lock: lock to initialize
+ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
+ */
+#define six_lock_init(lock, flags) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __six_lock_init((lock), #lock, &__key, flags); \
+} while (0)
+
+/**
+ * six_lock_seq - obtain current lock sequence number
+ * @lock: six_lock to obtain sequence number for
+ *
+ * @lock should be held for read or intent, and not write
+ *
+ * By saving the lock sequence number, we can unlock @lock and then (typically
+ * after some blocking operation) attempt to relock it: the relock will succeed
+ * if the sequence number hasn't changed, meaning no write locks have been taken
+ * and state corresponding to what @lock protects is still valid.
+ */
+static inline u32 six_lock_seq(const struct six_lock *lock)
+{
+ return lock->seq;
+}
+
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_trylock_type - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ return six_trylock_ip(lock, type, _THIS_IP_);
+}
+
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip);
+
+/**
+ * six_lock_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function
+ * for full documentation.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+/**
+ * six_lock_ip - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ struct six_lock_waiter wait;
+
+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
+}
+
+/**
+ * six_lock_type - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ struct six_lock_waiter wait;
+
+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq, unsigned long ip);
+
+/**
+ * six_relock_type - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ return six_relock_ip(lock, type, seq, _THIS_IP_);
+}
+
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_unlock_type - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock, NULL, NULL);          read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);  read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);     read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);     read count 0
+ */
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ six_unlock_ip(lock, type, _THIS_IP_);
+}
+
+#define __SIX_LOCK(type) \
+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
+{ \
+ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \
+} \
+ \
+static inline bool six_trylock_##type(struct six_lock *lock) \
+{ \
+ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
+} \
+ \
+static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \
+ struct six_lock_waiter *wait, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p,\
+ unsigned long ip) \
+{ \
+ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
+} \
+ \
+static inline int six_lock_ip_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p, \
+ unsigned long ip) \
+{ \
+ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
+} \
+ \
+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
+{ \
+ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \
+} \
+ \
+static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
+{ \
+ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \
+} \
+ \
+static inline int six_lock_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn fn, void *p)\
+{ \
+ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
+} \
+ \
+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
+{ \
+ six_unlock_ip(lock, SIX_LOCK_##type, ip); \
+} \
+ \
+static inline void six_unlock_##type(struct six_lock *lock) \
+{ \
+ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
+}
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+#undef __SIX_LOCK
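+
+/*
+ * Example (illustrative sketch; 'foo' is a hypothetical structure embedding a
+ * struct six_lock): the per-type wrappers generated above are the usual way
+ * to take and drop locks when no should_sleep_fn callback is needed:
+ *
+ *   if (six_trylock_read(&foo->lock)) {
+ *           ... read-only access to foo ...
+ *           six_unlock_read(&foo->lock);
+ *   }
+ *
+ *   six_lock_intent(&foo->lock, NULL, NULL);
+ *   ... foo may be read here, and the lock later taken for write ...
+ *   six_unlock_intent(&foo->lock);
+ */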
+
+void six_lock_downgrade(struct six_lock *);
+bool six_lock_tryupgrade(struct six_lock *);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+ enum six_lock_type);
+
+void six_lock_increment(struct six_lock *, enum six_lock_type);
+
+void six_lock_wakeup_all(struct six_lock *);
+
+struct six_lock_count {
+ unsigned n[3];
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+void six_lock_readers_add(struct six_lock *, int);
+
+#endif /* _LINUX_SIX_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
new file mode 100644
index 000000000000..5dac038f0851
--- /dev/null
+++ b/fs/bcachefs/snapshot.c
@@ -0,0 +1,1713 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "snapshot.h"
+
+#include <linux/random.h>
+
+/*
+ * Snapshot trees:
+ *
+ * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
+ * exist to provide a stable identifier for the whole lifetime of a snapshot
+ * tree.
+ */
+
+void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
+
+ prt_printf(out, "subvol %u root snapshot %u",
+ le32_to_cpu(t.v->master_subvol),
+ le32_to_cpu(t.v->root_snapshot));
+}
+
+int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1)), c, err,
+ snapshot_tree_pos_bad,
+ "bad pos");
+fsck_err:
+ return ret;
+}
+
+int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot_tree *s)
+{
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
+ BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
+
+ if (bch2_err_matches(ret, ENOENT))
+ ret = -BCH_ERR_ENOENT_snapshot_tree;
+ return ret;
+}
+
+struct bkey_i_snapshot_tree *
+__bch2_snapshot_tree_create(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ int ret = bch2_bkey_get_empty_slot(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, U32_MAX));
+ struct bkey_i_snapshot_tree *s_t;
+
+ if (ret == -BCH_ERR_ENOSPC_btree_slot)
+ ret = -BCH_ERR_ENOSPC_snapshot_tree;
+ if (ret)
+ return ERR_PTR(ret);
+
+ s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(s_t);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret ? ERR_PTR(ret) : s_t;
+}
+
+static int bch2_snapshot_tree_create(struct btree_trans *trans,
+ u32 root_id, u32 subvol_id, u32 *tree_id)
+{
+ struct bkey_i_snapshot_tree *n_tree =
+ __bch2_snapshot_tree_create(trans);
+
+ if (IS_ERR(n_tree))
+ return PTR_ERR(n_tree);
+
+ n_tree->v.master_subvol = cpu_to_le32(subvol_id);
+ n_tree->v.root_snapshot = cpu_to_le32(root_id);
+ *tree_id = n_tree->k.p.offset;
+ return 0;
+}
+
+/* Snapshot nodes: */
+
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ struct snapshot_table *t;
+
+ rcu_read_lock();
+ t = rcu_dereference(c->snapshots);
+
+ while (id && id < ancestor)
+ id = __snapshot_t(t, id)->parent;
+ rcu_read_unlock();
+
+ return id == ancestor;
+}
+
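+/*
+ * A snapshot node's skip[] field holds three randomly chosen ancestor IDs
+ * (see bch2_snapshot_skiplist_get()), kept sorted so that
+ * skip[0] <= skip[1] <= skip[2].  Ancestors always have larger IDs than their
+ * descendants, so the ancestor walk below can jump to the largest skip entry
+ * that doesn't overshoot @ancestor instead of walking one parent at a time.
+ */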
+static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+
+ if (s->skip[2] <= ancestor)
+ return s->skip[2];
+ if (s->skip[1] <= ancestor)
+ return s->skip[1];
+ if (s->skip[0] <= ancestor)
+ return s->skip[0];
+ return s->parent;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ struct snapshot_table *t;
+ bool ret;
+
+ EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
+
+ rcu_read_lock();
+ t = rcu_dereference(c->snapshots);
+
+ while (id && id < ancestor - IS_ANCESTOR_BITMAP)
+ id = get_ancestor_below(t, id, ancestor);
+
+ if (id && id < ancestor) {
+ ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
+
+ EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
+ } else {
+ ret = id == ancestor;
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+ size_t idx = U32_MAX - id;
+ size_t new_size;
+ struct snapshot_table *new, *old;
+
+ new_size = max(16UL, roundup_pow_of_two(idx + 1));
+
+ new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ old = rcu_dereference_protected(c->snapshots, true);
+ if (old)
+ memcpy(new->s,
+ rcu_dereference_protected(c->snapshots, true)->s,
+ sizeof(new->s[0]) * c->snapshot_table_size);
+
+ rcu_assign_pointer(c->snapshots, new);
+ c->snapshot_table_size = new_size;
+ kvfree_rcu_mightsleep(old);
+
+ return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+}
+
+static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+ size_t idx = U32_MAX - id;
+
+ lockdep_assert_held(&c->snapshot_table_lock);
+
+ if (likely(idx < c->snapshot_table_size))
+ return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+
+ return __snapshot_t_mut(c, id);
+}
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
+ BCH_SNAPSHOT_SUBVOL(s.v),
+ BCH_SNAPSHOT_DELETED(s.v),
+ le32_to_cpu(s.v->parent),
+ le32_to_cpu(s.v->children[0]),
+ le32_to_cpu(s.v->children[1]),
+ le32_to_cpu(s.v->subvol),
+ le32_to_cpu(s.v->tree));
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
+ prt_printf(out, " depth %u skiplist %u %u %u",
+ le32_to_cpu(s.v->depth),
+ le32_to_cpu(s.v->skip[0]),
+ le32_to_cpu(s.v->skip[1]),
+ le32_to_cpu(s.v->skip[2]));
+}
+
+int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_snapshot s;
+ u32 i, id;
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1)), c, err,
+ snapshot_pos_bad,
+ "bad pos");
+
+ s = bkey_s_c_to_snapshot(k);
+
+ id = le32_to_cpu(s.v->parent);
+ bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
+ snapshot_parent_bad,
+ "bad parent node (%u <= %llu)",
+ id, k.k->p.offset);
+
+ bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
+ snapshot_children_not_normalized,
+ "children not normalized");
+
+ bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
+ snapshot_child_duplicate,
+ "duplicate child nodes");
+
+ for (i = 0; i < 2; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ bkey_fsck_err_on(id >= k.k->p.offset, c, err,
+ snapshot_child_bad,
+ "bad child node (%u >= %llu)",
+ id, k.k->p.offset);
+ }
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
+ bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
+ snapshot_skiplist_not_normalized,
+ "skiplist not normalized");
+
+ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
+ id = le32_to_cpu(s.v->skip[i]);
+
+ bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
+ snapshot_skiplist_bad,
+ "bad skiplist node %u", id);
+ }
+ }
+fsck_err:
+ return ret;
+}
+
+static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *t = snapshot_t_mut(c, id);
+ u32 parent = id;
+
+ while ((parent = bch2_snapshot_parent_early(c, parent)) &&
+ parent - id - 1 < IS_ANCESTOR_BITMAP)
+ __set_bit(parent - id - 1, t->is_ancestor);
+}
+
+static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+ mutex_lock(&c->snapshot_table_lock);
+ __set_is_ancestor_bitmap(c, id);
+ mutex_unlock(&c->snapshot_table_lock);
+}
+
+int bch2_mark_snapshot(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_t *t;
+ u32 id = new.k->p.offset;
+ int ret = 0;
+
+ mutex_lock(&c->snapshot_table_lock);
+
+ t = snapshot_t_mut(c, id);
+ if (!t) {
+ ret = -BCH_ERR_ENOMEM_mark_snapshot;
+ goto err;
+ }
+
+ if (new.k->type == KEY_TYPE_snapshot) {
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+ t->parent = le32_to_cpu(s.v->parent);
+ t->children[0] = le32_to_cpu(s.v->children[0]);
+ t->children[1] = le32_to_cpu(s.v->children[1]);
+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+ t->tree = le32_to_cpu(s.v->tree);
+
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
+ t->depth = le32_to_cpu(s.v->depth);
+ t->skip[0] = le32_to_cpu(s.v->skip[0]);
+ t->skip[1] = le32_to_cpu(s.v->skip[1]);
+ t->skip[2] = le32_to_cpu(s.v->skip[2]);
+ } else {
+ t->depth = 0;
+ t->skip[0] = 0;
+ t->skip[1] = 0;
+ t->skip[2] = 0;
+ }
+
+ __set_is_ancestor_bitmap(c, id);
+
+ if (BCH_SNAPSHOT_DELETED(s.v)) {
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
+ bch2_delete_dead_snapshots_async(c);
+ }
+ } else {
+ memset(t, 0, sizeof(*t));
+ }
+err:
+ mutex_unlock(&c->snapshot_table_lock);
+ return ret;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s)
+{
+ return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_WITH_UPDATES, snapshot, s);
+}
+
+static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
+{
+ struct bch_snapshot v;
+ int ret;
+
+ if (!id)
+ return 0;
+
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(trans->c, "snapshot node %u not found", id);
+ if (ret)
+ return ret;
+
+ return !BCH_SNAPSHOT_DELETED(&v);
+}
+
+/*
+ * If @k is a snapshot with just one live child, it's part of a linear chain,
+ * which we consider to be an equivalence class: after snapshot deletion
+ * cleanup, there should be only a single key at a given position in
+ * this equivalence class.
+ *
+ * This sets the equivalence class of @k to be the child's equivalence class, if
+ * it's part of such a linear chain: this correctly sets equivalence classes on
+ * startup if we run leaf to root (i.e. in natural key order).
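+ *
+ * For example, take a linear chain where 7 is the root, 5 its only child and
+ * 3 the leaf (children always have smaller IDs than their parents).
+ * Processing in key order - 3, then 5, then 7:
+ *
+ *   3: no live children        -> equiv(3) = 3
+ *   5: one live child (3)      -> equiv(5) = equiv(3) = 3
+ *   7: one live child (5)      -> equiv(7) = equiv(5) = 3
+ *
+ * i.e. every node in the chain ends up in the equivalence class of the
+ * leaf-most node.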
+ */
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ unsigned i, nr_live = 0, live_idx = 0;
+ struct bkey_s_c_snapshot snap;
+ u32 id = k.k->p.offset, child[2];
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+
+ child[0] = le32_to_cpu(snap.v->children[0]);
+ child[1] = le32_to_cpu(snap.v->children[1]);
+
+ for (i = 0; i < 2; i++) {
+ int ret = bch2_snapshot_live(trans, child[i]);
+
+ if (ret < 0)
+ return ret;
+
+ if (ret)
+ live_idx = i;
+ nr_live += ret;
+ }
+
+ mutex_lock(&c->snapshot_table_lock);
+
+ snapshot_t_mut(c, id)->equiv = nr_live == 1
+ ? snapshot_t_mut(c, child[live_idx])->equiv
+ : id;
+
+ mutex_unlock(&c->snapshot_table_lock);
+
+ return 0;
+}
+
+/* fsck: */
+
+static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
+{
+ return snapshot_t(c, id)->children[child];
+}
+
+static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
+{
+ return bch2_snapshot_child(c, id, 0);
+}
+
+static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
+{
+ return bch2_snapshot_child(c, id, 1);
+}
+
+static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
+{
+ u32 n, parent;
+
+ n = bch2_snapshot_left_child(c, id);
+ if (n)
+ return n;
+
+ while ((parent = bch2_snapshot_parent(c, id))) {
+ n = bch2_snapshot_right_child(c, parent);
+ if (n && n != id)
+ return n;
+ id = parent;
+ }
+
+ return 0;
+}
+
+static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+{
+ u32 id = snapshot_root;
+ u32 subvol = 0, s;
+
+ while (id) {
+ s = snapshot_t(c, id)->subvol;
+
+ if (s && (!subvol || s < subvol))
+ subvol = s;
+
+ id = bch2_snapshot_tree_next(c, id);
+ }
+
+ return subvol;
+}
+
+static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
+ u32 snapshot_root, u32 *subvol_id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_subvolume s;
+ bool found = false;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+ 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+
+ s = bkey_s_c_to_subvolume(k);
+ if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
+ continue;
+ if (!BCH_SUBVOLUME_SNAP(s.v)) {
+ *subvol_id = s.k->p.offset;
+ found = true;
+ break;
+ }
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && !found) {
+ struct bkey_i_subvolume *u;
+
+ *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
+
+ u = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, *subvol_id),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ SET_BCH_SUBVOLUME_SNAP(&u->v, false);
+ }
+
+ return ret;
+}
+
+static int check_snapshot_tree(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot_tree st;
+ struct bch_snapshot s;
+ struct bch_subvolume subvol;
+ struct printbuf buf = PRINTBUF;
+ u32 root_id;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot_tree)
+ return 0;
+
+ st = bkey_s_c_to_snapshot_tree(k);
+ root_id = le32_to_cpu(st.v->root_snapshot);
+
+ ret = bch2_snapshot_lookup(trans, root_id, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret ||
+ root_id != bch2_snapshot_root(c, root_id) ||
+ st.k->p.offset != le32_to_cpu(s.tree),
+ c, snapshot_tree_to_missing_snapshot,
+ "snapshot tree points to missing/incorrect snapshot:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto err;
+ }
+
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
+ false, 0, &subvol);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret,
+ c, snapshot_tree_to_missing_subvol,
+ "snapshot tree points to missing subvolume:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+ fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+ le32_to_cpu(subvol.snapshot),
+ root_id),
+ c, snapshot_tree_to_wrong_subvol,
+ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
+ c, snapshot_tree_to_snapshot_subvol,
+ "snapshot tree points to snapshot subvolume:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+ struct bkey_i_snapshot_tree *u;
+ u32 subvol_id;
+
+ ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ if (ret)
+ goto err;
+
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.master_subvol = cpu_to_le32(subvol_id);
+ st = snapshot_tree_i_to_s_c(u);
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * For each snapshot_tree, make sure it points to the root of a snapshot tree
+ * and that the root snapshot points back to it; otherwise delete it.
+ *
+ * And, make sure it points to a subvolume within that snapshot tree, or correct
+ * it to point to the oldest subvolume within that snapshot tree.
+ */
+int bch2_check_snapshot_trees(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_snapshot_trees, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_snapshot_tree(trans, &iter, k)));
+
+ if (ret)
+ bch_err(c, "error %i checking snapshot trees", ret);
+ return ret;
+}
+
+/*
+ * Look up the snapshot tree for @tree_id, find its root,
+ * and make sure @snap_id is a descendant:
+ */
+static int snapshot_tree_ptr_good(struct btree_trans *trans,
+ u32 snap_id, u32 tree_id)
+{
+ struct bch_snapshot_tree s_t;
+ int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+
+ if (bch2_err_matches(ret, ENOENT))
+ return 0;
+ if (ret)
+ return ret;
+
+ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s;
+
+ if (!id)
+ return 0;
+
+ rcu_read_lock();
+ s = snapshot_t(c, id);
+ if (s->parent)
+ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
+ rcu_read_unlock();
+
+ return id;
+}
+
+static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
+{
+ unsigned i;
+
+ for (i = 0; i < 3; i++)
+ if (!s.parent) {
+ if (s.skip[i])
+ return false;
+ } else {
+ if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * The snapshot_tree pointer was incorrect: look up the root snapshot node,
+ * make sure its snapshot_tree pointer is correct (allocating a new one if
+ * necessary), then update this node's pointer to match the root node's:
+ */
+static int snapshot_tree_ptr_repair(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_snapshot *s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter root_iter;
+ struct bch_snapshot_tree s_t;
+ struct bkey_s_c_snapshot root;
+ struct bkey_i_snapshot *u;
+ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
+ int ret;
+
+ root = bch2_bkey_get_iter_typed(trans, &root_iter,
+ BTREE_ID_snapshots, POS(0, root_id),
+ BTREE_ITER_WITH_UPDATES, snapshot);
+ ret = bkey_err(root);
+ if (ret)
+ goto err;
+
+ tree_id = le32_to_cpu(root.v->tree);
+
+ ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
+ u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u) ?:
+ bch2_snapshot_tree_create(trans, root_id,
+ bch2_snapshot_tree_oldest_subvol(c, root_id),
+ &tree_id);
+ if (ret)
+ goto err;
+
+ u->v.tree = cpu_to_le32(tree_id);
+ if (k.k->p.offset == root_id)
+ *s = u->v;
+ }
+
+ if (k.k->p.offset != root_id) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.tree = cpu_to_le32(tree_id);
+ *s = u->v;
+ }
+err:
+ bch2_trans_iter_exit(trans, &root_iter);
+ return ret;
+}
+
+static int check_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_snapshot s;
+ struct bch_subvolume subvol;
+ struct bch_snapshot v;
+ struct bkey_i_snapshot *u;
+ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
+ u32 real_depth;
+ struct printbuf buf = PRINTBUF;
+ bool should_have_subvol;
+ u32 i, id;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ memset(&s, 0, sizeof(s));
+ memcpy(&s, k.v, bkey_val_bytes(k.k));
+
+ id = le32_to_cpu(s.parent);
+ if (id) {
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot with nonexistent parent:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (ret)
+ goto err;
+
+ if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
+ le32_to_cpu(v.children[1]) != k.k->p.offset) {
+ bch_err(c, "snapshot parent %u missing pointer to child %llu",
+ id, k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ for (i = 0; i < 2 && s.children[i]; i++) {
+ id = le32_to_cpu(s.children[i]);
+
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot node %llu has nonexistent child %u",
+ k.k->p.offset, id);
+ if (ret)
+ goto err;
+
+ if (le32_to_cpu(v.parent) != k.k->p.offset) {
+ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
+ id, le32_to_cpu(v.parent), k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ !BCH_SNAPSHOT_DELETED(&s);
+
+ if (should_have_subvol) {
+ id = le32_to_cpu(s.subvol);
+ ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (ret)
+ goto err;
+
+ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
+ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+ k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ } else {
+ if (fsck_err_on(s.subvol,
+ c, snapshot_should_not_have_subvol,
+ "snapshot should not point to subvol:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.subvol = 0;
+ s = u->v;
+ }
+ }
+
+ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
+ "snapshot points to missing/incorrect tree:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
+ if (ret)
+ goto err;
+ }
+ ret = 0;
+
+ real_depth = bch2_snapshot_depth(c, parent_id);
+
+ if (le32_to_cpu(s.depth) != real_depth &&
+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
+ fsck_err(c, snapshot_bad_depth,
+ "snapshot with incorrect depth field, should be %u:\n %s",
+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.depth = cpu_to_le32(real_depth);
+ s = u->v;
+ }
+
+ ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
+ if (ret < 0)
+ goto err;
+
+ if (!ret &&
+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
+ fsck_err(c, snapshot_bad_skiplist,
+ "snapshot with bad skiplist field:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
+ u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
+
+ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
+ s = u->v;
+ }
+ ret = 0;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_snapshots(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ /*
+ * We iterate backwards as checking/fixing the depth field requires that
+ * the parent's depth already be correct:
+ */
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse_commit(trans, iter,
+ BTREE_ID_snapshots, POS_MAX,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_snapshot(trans, &iter, k)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s;
+ int ret = 0;
+
+ s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+ trans->c, "missing snapshot %u", id);
+ return ret;
+ }
+
+ /* already deleted? */
+ if (BCH_SNAPSHOT_DELETED(&s->v))
+ goto err;
+
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+ s->v.subvol = 0;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
+{
+ if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
+ swap(s->children[0], s->children[1]);
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+ struct btree_iter c_iter = (struct btree_iter) { NULL };
+ struct btree_iter tree_iter = (struct btree_iter) { NULL };
+ struct bkey_s_c_snapshot s;
+ u32 parent_id, child_id;
+ unsigned i;
+ int ret = 0;
+
+ s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_INTENT, snapshot);
+ ret = bkey_err(s);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", id);
+
+ if (ret)
+ goto err;
+
+ BUG_ON(s.v->children[1]);
+
+ parent_id = le32_to_cpu(s.v->parent);
+ child_id = le32_to_cpu(s.v->children[0]);
+
+ if (parent_id) {
+ struct bkey_i_snapshot *parent;
+
+ parent = bch2_bkey_get_mut_typed(trans, &p_iter,
+ BTREE_ID_snapshots, POS(0, parent_id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(parent);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", parent_id);
+ if (unlikely(ret))
+ goto err;
+
+ /* find entry in parent->children for node being deleted */
+ for (i = 0; i < 2; i++)
+ if (le32_to_cpu(parent->v.children[i]) == id)
+ break;
+
+ if (bch2_fs_inconsistent_on(i == 2, c,
+ "snapshot %u missing child pointer to %u",
+ parent_id, id))
+ goto err;
+
+ parent->v.children[i] = cpu_to_le32(child_id);
+
+ normalize_snapshot_child_pointers(&parent->v);
+ }
+
+ if (child_id) {
+ struct bkey_i_snapshot *child;
+
+ child = bch2_bkey_get_mut_typed(trans, &c_iter,
+ BTREE_ID_snapshots, POS(0, child_id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(child);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", child_id);
+ if (unlikely(ret))
+ goto err;
+
+ child->v.parent = cpu_to_le32(parent_id);
+
+ if (!child->v.parent) {
+ child->v.skip[0] = 0;
+ child->v.skip[1] = 0;
+ child->v.skip[2] = 0;
+ }
+ }
+
+ if (!parent_id) {
+ /*
+ * We're deleting the root of a snapshot tree: update the
+ * snapshot_tree entry to point to the new root, or delete it if
+ * this is the last snapshot ID in this tree:
+ */
+ struct bkey_i_snapshot_tree *s_t;
+
+ BUG_ON(s.v->children[1]);
+
+ s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(s_t);
+ if (ret)
+ goto err;
+
+ if (s.v->children[0]) {
+ s_t->v.root_snapshot = s.v->children[0];
+ } else {
+ s_t->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&s_t->k, 0);
+ }
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &tree_iter);
+ bch2_trans_iter_exit(trans, &p_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n;
+ struct bkey_s_c k;
+ unsigned i, j;
+ u32 depth = bch2_snapshot_depth(c, parent);
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS_MIN, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_snapids; i++) {
+ k = bch2_btree_iter_prev_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || !k.k->p.offset) {
+ ret = -BCH_ERR_ENOSPC_snapshot_create;
+ goto err;
+ }
+
+ n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.flags = 0;
+ n->v.parent = cpu_to_le32(parent);
+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
+ n->v.tree = cpu_to_le32(tree);
+ n->v.depth = cpu_to_le32(depth);
+
+ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
+ n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
+
+ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+ ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+ if (ret)
+ goto err;
+
+ new_snapids[i] = iter.pos.offset;
+
+ mutex_lock(&c->snapshot_table_lock);
+ snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
+ mutex_unlock(&c->snapshot_table_lock);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * Create new snapshot IDs as children of an existing snapshot ID:
+ */
+static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n_parent;
+ int ret = 0;
+
+ n_parent = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, parent),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(n_parent);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(trans->c, "snapshot %u not found", parent);
+ return ret;
+ }
+
+ if (n_parent->v.children[0] || n_parent->v.children[1]) {
+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
+ new_snapids, snapshot_subvols, nr_snapids);
+ if (ret)
+ goto err;
+
+ n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
+ n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
+ n_parent->v.subvol = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * Create a snapshot node that is the root of a new tree:
+ */
+static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct bkey_i_snapshot_tree *n_tree;
+ int ret;
+
+ n_tree = __bch2_snapshot_tree_create(trans);
+ ret = PTR_ERR_OR_ZERO(n_tree) ?:
+ create_snapids(trans, 0, n_tree->k.p.offset,
+ new_snapids, snapshot_subvols, nr_snapids);
+ if (ret)
+ return ret;
+
+ n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
+ n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
+ return 0;
+}
+
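+/*
+ * Creating the root of a new tree (parent == 0) allocates a single snapshot
+ * ID; creating children of an existing node allocates exactly two, since
+ * snapshotting a subvolume requires one ID for the new snapshot and one for
+ * the subvolume to continue writing under.
+ */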
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ BUG_ON((parent == 0) != (nr_snapids == 1));
+ BUG_ON((parent != 0) != (nr_snapids == 2));
+
+ return parent
+ ? bch2_snapshot_node_create_children(trans, parent,
+ new_snapids, snapshot_subvols, nr_snapids)
+ : bch2_snapshot_node_create_tree(trans,
+ new_snapids, snapshot_subvols, nr_snapids);
+
+}
+
+/*
+ * If we have an unlinked inode in an internal snapshot node, and the inode
+ * really has been deleted in all child snapshots, how does this get cleaned up?
+ *
+ * first there is the problem of how keys that have been overwritten in all
+ * child snapshots get deleted (unimplemented?), but inodes may perhaps be
+ * special?
+ *
+ * also: unlinked inode in internal snapshot appears to not be getting deleted
+ * correctly if inode doesn't exist in leaf snapshots
+ *
+ * solution:
+ *
+ * for a key in an interior snapshot node that needs work to be done that
+ * requires it to be mutated: iterate over all descendant leaf nodes and copy
+ * that key to snapshot leaf nodes, where we can mutate it
+ */
+
+static int snapshot_delete_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ snapshot_id_list *deleted,
+ snapshot_id_list *equiv_seen,
+ struct bpos *last_pos)
+{
+ struct bch_fs *c = trans->c;
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ if (!bkey_eq(k.k->p, *last_pos))
+ equiv_seen->nr = 0;
+ *last_pos = k.k->p;
+
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+ snapshot_list_has_id(equiv_seen, equiv)) {
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ } else {
+ return snapshot_list_add(c, equiv_seen, equiv);
+ }
+}
+
+static int move_key_to_correct_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ /*
+ * When we have a linear chain of snapshot nodes, we consider
+ * those to form an equivalence class: we're going to collapse
+ * them all down to a single node, and keep the leaf-most node -
+ * which has the same id as the equivalence class id.
+ *
+ * If there are multiple keys in different snapshots at the same
+ * position, we're only going to keep the one in the newest
+ * snapshot - the rest have been overwritten and are redundant,
+ * and for the key we're going to keep we need to move it to the
+ * equivalence class ID if it's not there already.
+ */
+ if (equiv != k.k->p.snapshot) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ struct btree_iter new_iter;
+ int ret;
+
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ new->k.p.snapshot = equiv;
+
+ bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&new_iter) ?:
+ bch2_trans_update(trans, &new_iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &new_iter);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot snap;
+ u32 children[2];
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ BCH_SNAPSHOT_SUBVOL(snap.v))
+ return 0;
+
+ children[0] = le32_to_cpu(snap.v->children[0]);
+ children[1] = le32_to_cpu(snap.v->children[1]);
+
+ ret = bch2_snapshot_live(trans, children[0]) ?:
+ bch2_snapshot_live(trans, children[1]);
+ if (ret < 0)
+ return ret;
+ return !ret;
+}
+
+/*
+ * For a given snapshot, if no subvolume points to it and it has no child
+ * snapshot nodes, it's redundant and we can mark it as deleted.
+ */
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
+{
+ int ret = bch2_snapshot_needs_delete(trans, k);
+
+ return ret <= 0
+ ? ret
+ : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+}
+
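+/*
+ * Like bch2_snapshot_nth_parent(), but ancestors in @skip are not counted
+ * (and are never returned): used below to repoint skiplist entries away from
+ * interior nodes that are about to be deleted.
+ */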
+static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
+ snapshot_id_list *skip)
+{
+ rcu_read_lock();
+ while (snapshot_list_has_id(skip, id))
+ id = __bch2_snapshot_parent(c, id);
+
+ while (n--) {
+ do {
+ id = __bch2_snapshot_parent(c, id);
+ } while (snapshot_list_has_id(skip, id));
+ }
+ rcu_read_unlock();
+
+ return id;
+}
+
+static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k,
+ snapshot_id_list *deleted)
+{
+ struct bch_fs *c = trans->c;
+ u32 nr_deleted_ancestors = 0;
+ struct bkey_i_snapshot *s;
+ u32 *i;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ if (snapshot_list_has_id(deleted, k.k->p.offset))
+ return 0;
+
+ s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ darray_for_each(*deleted, i)
+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+
+ if (!nr_deleted_ancestors)
+ return 0;
+
+ le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
+
+ if (!s->v.depth) {
+ s->v.skip[0] = 0;
+ s->v.skip[1] = 0;
+ s->v.skip[2] = 0;
+ } else {
+ u32 depth = le32_to_cpu(s->v.depth);
+ u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
+
+ for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
+ u32 id = le32_to_cpu(s->v.skip[j]);
+
+ if (snapshot_list_has_id(deleted, id)) {
+ id = bch2_snapshot_nth_parent_skip(c,
+ parent,
+ depth > 1
+ ? get_random_u32_below(depth - 1)
+ : 0,
+ deleted);
+ s->v.skip[j] = cpu_to_le32(id);
+ }
+ }
+
+ bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
+ }
+
+ return bch2_trans_update(trans, iter, &s->k_i, 0);
+}
+
+int bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot snap;
+ snapshot_id_list deleted = { 0 };
+ snapshot_id_list deleted_interior = { 0 };
+ u32 *i, id;
+ int ret = 0;
+
+ if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags))
+ return 0;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+ ret = bch2_fs_read_write_early(c);
+ if (ret) {
+ bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+ return ret;
+ }
+ }
+
+ trans = bch2_trans_get(c);
+
+ /*
+ * For every snapshot node: if it has no live children and it's not
+ * pointed to by a subvolume, delete it:
+ */
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ NULL, NULL, 0,
+ bch2_delete_redundant_snapshot(trans, k));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting redundant snapshots");
+ goto err;
+ }
+
+ ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_snapshot_set_equiv(trans, k));
+ if (ret) {
+ bch_err_msg(c, ret, "in bch2_snapshot_set_equiv");
+ goto err;
+ }
+
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v)) {
+ ret = snapshot_list_add(c, &deleted, k.k->p.offset);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret) {
+ bch_err_msg(c, ret, "walking snapshots");
+ goto err;
+ }
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ struct bpos last_pos = POS_MIN;
+ snapshot_id_list equiv_seen = { 0 };
+ struct disk_reservation res = { 0 };
+
+ if (!btree_type_has_snapshots(id))
+ continue;
+
+ /*
+ * The deleted inodes btree is maintained by a trigger on the inodes
+ * btree - no work for us to do here, and it's not safe to scan
+ * it because we'll see out of date keys due to the btree write
+ * buffer:
+ */
+ if (id == BTREE_ID_deleted_inodes)
+ continue;
+
+ ret = for_each_btree_key_commit(trans, iter,
+ id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL, BTREE_INSERT_NOFAIL,
+ snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
+ for_each_btree_key_commit(trans, iter,
+ id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL, BTREE_INSERT_NOFAIL,
+ move_key_to_correct_snapshot(trans, &iter, k));
+
+ bch2_disk_reservation_put(c, &res);
+ darray_exit(&equiv_seen);
+
+ if (ret) {
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ goto err;
+ }
+ }
+
+ bch2_trans_unlock(trans);
+ down_write(&c->snapshot_create_lock);
+
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ u32 snapshot = k.k->p.offset;
+ u32 equiv = bch2_snapshot_equiv(c, snapshot);
+
+ if (equiv != snapshot)
+ snapshot_list_add(c, &deleted_interior, snapshot);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ goto err_create_lock;
+
+ /*
+ * Fixing children of deleted snapshots can't be done completely
+ * atomically: if we crash between here and when we delete the interior
+ * nodes, some depth fields will be off:
+ */
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
+ if (ret)
+ goto err_create_lock;
+
+ darray_for_each(deleted, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ goto err_create_lock;
+ }
+ }
+
+ darray_for_each(deleted_interior, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ goto err_create_lock;
+ }
+ }
+err_create_lock:
+ up_write(&c->snapshot_create_lock);
+err:
+ darray_exit(&deleted_interior);
+ darray_exit(&deleted);
+ bch2_trans_put(trans);
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+
+ bch2_delete_dead_snapshots(c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
+ !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
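+/*
+ * Returns 1 if the key at @pos has been overwritten in a descendant snapshot,
+ * i.e. if another key exists at the same inode:offset whose snapshot ID is a
+ * descendant of @pos.snapshot; 0 if not, or a negative error code.
+ */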
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, id, pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ if (!k.k)
+ break;
+
+ if (!bkey_eq(pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+ ret = 1;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s = snapshot_t(c, id);
+
+ return s->children[1] ?: s->children[0];
+}
+
+static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
+{
+ u32 child;
+
+ while ((child = bch2_snapshot_smallest_child(c, id)))
+ id = child;
+ return id;
+}
+
+static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_s_c interior_k,
+ u32 leaf_id, struct bpos *new_min_pos)
+{
+ struct btree_iter iter;
+ struct bpos pos = interior_k.k->p;
+ struct bkey_s_c k;
+ struct bkey_i *new;
+ int ret;
+
+ pos.snapshot = leaf_id;
+
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ /* key already overwritten in this snapshot? */
+ if (k.k->p.snapshot != interior_k.k->p.snapshot)
+ goto out;
+
+ if (bpos_eq(*new_min_pos, POS_MIN)) {
+ *new_min_pos = k.k->p;
+ new_min_pos->snapshot = leaf_id;
+ }
+
+ new = bch2_bkey_make_mut_noupdate(trans, interior_k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ new->k.p.snapshot = leaf_id;
+ ret = bch2_trans_update(trans, &iter, new, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_s_c k,
+ struct bpos *new_min_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf sk;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&sk);
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ *new_min_pos = POS_MIN;
+
+ for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
+ id < k.k->p.snapshot;
+ id++) {
+ if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
+ !bch2_snapshot_is_leaf(c, id))
+ continue;
+again:
+ ret = btree_trans_too_many_iters(trans) ?:
+ bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+ if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ bch2_trans_begin(trans);
+ goto again;
+ }
+
+ if (ret)
+ break;
+ }
+
+ bch2_bkey_buf_exit(&sk, c);
+
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot snap;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
+ (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ return 0;
+ }
+
+ return ret;
+}
+
+int bch2_snapshots_read(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+ bch2_snapshot_set_equiv(trans, k) ?:
+ bch2_check_snapshot_needs_deletion(trans, k)) ?:
+ for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+ kfree(rcu_dereference_protected(c->snapshots, true));
+}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
new file mode 100644
index 000000000000..f09a22f44239
--- /dev/null
+++ b/fs/bcachefs/snapshot.h
@@ -0,0 +1,268 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_H
+#define _BCACHEFS_SNAPSHOT_H
+
+enum bkey_invalid_flags;
+
+void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+
+#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
+ .key_invalid = bch2_snapshot_tree_invalid, \
+ .val_to_text = bch2_snapshot_tree_to_text, \
+ .min_val_size = 8, \
+})
+
+struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
+
+int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+
+#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
+ .key_invalid = bch2_snapshot_invalid, \
+ .val_to_text = bch2_snapshot_to_text, \
+ .atomic_trigger = bch2_mark_snapshot, \
+ .min_val_size = 24, \
+})
+
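+/*
+ * The in-memory snapshot table is indexed by U32_MAX - id: snapshot IDs are
+ * allocated counting down from U32_MAX (children always have smaller IDs than
+ * their parents), so this keeps the table small and densely packed.
+ */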
+static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
+{
+ return &t->s[U32_MAX - id];
+}
+
+static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+ return __snapshot_t(rcu_dereference(c->snapshots), id);
+}
+
+static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = snapshot_t(c, id)->tree;
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_parent_early(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ u32 parent = snapshot_t(c, id)->parent;
+
+ if (parent &&
+ snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+ panic("id %u depth=%u parent %u depth=%u\n",
+ id, snapshot_t(c, id)->depth,
+ parent, snapshot_t(c, parent)->depth);
+
+ return parent;
+#else
+ return snapshot_t(c, id)->parent;
+#endif
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_parent(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
+{
+ rcu_read_lock();
+ while (n--)
+ id = __bch2_snapshot_parent(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
+
+static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
+{
+ u32 parent;
+
+ rcu_read_lock();
+ while ((parent = __bch2_snapshot_parent(c, id)))
+ id = parent;
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->equiv;
+}
+
+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_equiv(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
+{
+ return id == bch2_snapshot_equiv(c, id);
+}
+
+static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s;
+ bool ret;
+
+ rcu_read_lock();
+ s = snapshot_t(c, id);
+ ret = s->children[0];
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
+{
+ return !bch2_snapshot_is_internal_node(c, id);
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s;
+ u32 parent = __bch2_snapshot_parent(c, id);
+
+ if (!parent)
+ return 0;
+
+ s = snapshot_t(c, __bch2_snapshot_parent(c, id));
+ if (id == s->children[0])
+ return s->children[1];
+ if (id == s->children[1])
+ return s->children[0];
+ return 0;
+}
+
+static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
+{
+ u32 depth;
+
+ rcu_read_lock();
+ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
+ rcu_read_unlock();
+
+ return depth;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
+
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ return id == ancestor
+ ? true
+ : __bch2_snapshot_is_ancestor(c, id, ancestor);
+}
+
+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *t;
+ bool ret;
+
+ rcu_read_lock();
+ t = snapshot_t(c, id);
+ ret = (t->children[0]|t->children[1]) != 0;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
+{
+ u32 *i;
+
+ darray_for_each(*s, i)
+ if (*i == id)
+ return true;
+ return false;
+}
+
+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ u32 *i;
+
+ darray_for_each(*s, i)
+ if (bch2_snapshot_is_ancestor(c, id, *i))
+ return true;
+ return false;
+}
+
+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret;
+
+ BUG_ON(snapshot_list_has_id(s, id));
+ ret = darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+ struct bch_subvolume *);
+
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+ u32 *, u32 *, unsigned);
+
+int bch2_check_snapshot_trees(struct bch_fs *);
+int bch2_check_snapshots(struct bch_fs *);
+
+int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
+void bch2_delete_dead_snapshots_work(struct work_struct *);
+
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
+
+static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ if (!btree_type_has_snapshots(id) ||
+ bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+ return 0;
+
+ return __bch2_key_has_snapshot_overwrites(trans, id, pos);
+}
+
+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
+ struct bkey_s_c, struct bpos *);
+
+int bch2_snapshots_read(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+
+#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
new file mode 100644
index 000000000000..ae21a8cca1b4
--- /dev/null
+++ b/fs/bcachefs/str_hash.h
@@ -0,0 +1,370 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_STR_HASH_H
+#define _BCACHEFS_STR_HASH_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "checksum.h"
+#include "error.h"
+#include "inode.h"
+#include "siphash.h"
+#include "subvolume.h"
+#include "super.h"
+
+#include <linux/crc32c.h>
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+
+static inline enum bch_str_hash_type
+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
+{
+ switch (opt) {
+ case BCH_STR_HASH_OPT_crc32c:
+ return BCH_STR_HASH_crc32c;
+ case BCH_STR_HASH_OPT_crc64:
+ return BCH_STR_HASH_crc64;
+ case BCH_STR_HASH_OPT_siphash:
+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
+ ? BCH_STR_HASH_siphash
+ : BCH_STR_HASH_siphash_old;
+ default:
+ BUG();
+ }
+}
+
+struct bch_hash_info {
+ u8 type;
+ /*
+ * For crc32 or crc64 string hashes the first key value of
+ * the siphash_key (k0) is used as the key.
+ */
+ SIPHASH_KEY siphash_key;
+};
+
+static inline struct bch_hash_info
+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
+{
+ /* XXX ick */
+ struct bch_hash_info info = {
+ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
+ ~(~0U << INODE_STR_HASH_BITS),
+ .siphash_key = { .k0 = bi->bi_hash_seed }
+ };
+
+ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
+ SHASH_DESC_ON_STACK(desc, c->sha256);
+ u8 digest[SHA256_DIGEST_SIZE];
+
+ desc->tfm = c->sha256;
+
+ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
+ sizeof(bi->bi_hash_seed), digest);
+ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
+ }
+
+ return info;
+}
+
+struct bch_str_hash_ctx {
+ union {
+ u32 crc32c;
+ u64 crc64;
+ SIPHASH_CTX siphash;
+ };
+};
+
+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_crc32c:
+ ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
+ sizeof(info->siphash_key.k0));
+ break;
+ case BCH_STR_HASH_crc64:
+ ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
+ sizeof(info->siphash_key.k0));
+ break;
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
+ SipHash24_Init(&ctx->siphash, &info->siphash_key);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info,
+ const void *data, size_t len)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_crc32c:
+ ctx->crc32c = crc32c(ctx->crc32c, data, len);
+ break;
+ case BCH_STR_HASH_crc64:
+ ctx->crc64 = crc64_be(ctx->crc64, data, len);
+ break;
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
+ SipHash24_Update(&ctx->siphash, data, len);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_crc32c:
+ return ctx->crc32c;
+ case BCH_STR_HASH_crc64:
+ return ctx->crc64 >> 1;
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
+ return SipHash24_End(&ctx->siphash) >> 1;
+ default:
+ BUG();
+ }
+}
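+
+/*
+ * Example (illustrative only): hashing a name with whichever hash function
+ * the inode's bch_hash_info selects, where 'info' comes from
+ * bch2_hash_info_init() and 'name'/'len' are the string being hashed:
+ *
+ *   struct bch_str_hash_ctx ctx;
+ *
+ *   bch2_str_hash_init(&ctx, info);
+ *   bch2_str_hash_update(&ctx, info, name, len);
+ *   hash = bch2_str_hash_end(&ctx, info);
+ */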
+
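+/*
+ * A bch_hash_desc describes one btree-backed hash table: which btree it lives
+ * in, the key type of its entries, and how to hash and compare lookup keys
+ * and existing entries (dirents and xattrs each define one of these).
+ */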
+struct bch_hash_desc {
+ enum btree_id btree_id;
+ u8 key_type;
+
+ u64 (*hash_key)(const struct bch_hash_info *, const void *);
+ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
+ bool (*cmp_key)(struct bkey_s_c, const void *);
+ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+ bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
+};
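
As a hedged sketch of how a descriptor wires the streaming hash above to a
btree (the "example_*" names and the BTREE_ID_dirents/KEY_TYPE_dirent choice
are illustrative placeholders; the real dirent and xattr descriptors are
defined in their own files, not in this header):

	static u64 example_hash_key(const struct bch_hash_info *info, const void *key)
	{
		const struct qstr *name = key;	/* assumption: keys are filenames */
		struct bch_str_hash_ctx ctx;

		bch2_str_hash_init(&ctx, info);
		bch2_str_hash_update(&ctx, info, name->name, name->len);
		return bch2_str_hash_end(&ctx, info);
	}

	/* remaining callbacks assumed to be defined elsewhere: */
	u64  example_hash_bkey(const struct bch_hash_info *, struct bkey_s_c);
	bool example_cmp_key(struct bkey_s_c, const void *);
	bool example_cmp_bkey(struct bkey_s_c, struct bkey_s_c);

	static const struct bch_hash_desc example_hash_desc = {
		.btree_id	= BTREE_ID_dirents,
		.key_type	= KEY_TYPE_dirent,
		.hash_key	= example_hash_key,
		.hash_bkey	= example_hash_bkey,
		.cmp_key	= example_cmp_key,
		.cmp_bkey	= example_cmp_bkey,
	};
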
+
+static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
+{
+ return k.k->type == desc.key_type &&
+ (!desc.is_visible ||
+ !inum.inum ||
+ desc.is_visible(inum, k));
+}
+
+static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key,
+ unsigned flags)
+{
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_SLOTS|flags, k, ret) {
+ if (is_visible_key(desc, inum, k)) {
+ if (!desc.cmp_key(k, key))
+ return 0;
+ } else if (k.k->type == KEY_TYPE_hash_whiteout) {
+ ;
+ } else {
+ /* hole, not found */
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;
+}
+
+static __always_inline int
+bch2_hash_hole(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key)
+{
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
+ if (!is_visible_key(desc, inum, k))
+ return 0;
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
+}
+
+static __always_inline
+int bch2_hash_needs_whiteout(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *start)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_copy_iter(&iter, start);
+
+ bch2_btree_iter_advance(&iter);
+
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
+ if (k.k->type != desc.key_type &&
+ k.k->type != KEY_TYPE_hash_whiteout)
+ break;
+
+ if (k.k->type == desc.key_type &&
+ desc.hash_bkey(info, k) <= start->pos.offset) {
+ ret = 1;
+ break;
+ }
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static __always_inline
+int bch2_hash_set_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ int flags,
+ int update_flags)
+{
+ struct btree_iter iter, slot = { NULL };
+ struct bkey_s_c k;
+ bool found = false;
+ int ret;
+
+ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+ SPOS(insert->k.p.inode,
+ desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+ snapshot),
+ POS(insert->k.p.inode, U64_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (is_visible_key(desc, inum, k)) {
+ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
+ goto found;
+
+ /* hash collision: */
+ continue;
+ }
+
+ if (!slot.path &&
+ !(flags & BCH_HASH_SET_MUST_REPLACE))
+ bch2_trans_copy_iter(&slot, &iter);
+
+ if (k.k->type != KEY_TYPE_hash_whiteout)
+ goto not_found;
+ }
+
+ if (!ret)
+ ret = -BCH_ERR_ENOSPC_str_hash_create;
+out:
+ bch2_trans_iter_exit(trans, &slot);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+found:
+ found = true;
+not_found:
+
+ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+ ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
+ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+ ret = -EEXIST;
+ } else {
+ if (!found && slot.path)
+ swap(iter, slot);
+
+ insert->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, insert, 0);
+ }
+
+ goto out;
+}
+
+static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum,
+ struct bkey_i *insert, int flags)
+{
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ insert->k.p.inode = inum.inum;
+
+ return bch2_hash_set_snapshot(trans, desc, info, inum,
+ snapshot, insert, flags, 0);
+}
+
+static __always_inline
+int bch2_hash_delete_at(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter,
+ unsigned update_flags)
+{
+ struct bkey_i *delete;
+ int ret;
+
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ ret = PTR_ERR_OR_ZERO(delete);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
+ if (ret < 0)
+ return ret;
+
+ bkey_init(&delete->k);
+ delete->k.p = iter->pos;
+ delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
+
+ return bch2_trans_update(trans, iter, delete, update_flags);
+}
+
+static __always_inline
+int bch2_hash_delete(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
+ BTREE_ITER_INTENT);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
new file mode 100644
index 000000000000..22b34a8e4d6e
--- /dev/null
+++ b/fs/bcachefs/subvolume.c
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "snapshot.h"
+#include "subvolume.h"
+
+#include <linux/random.h>
+
+static int bch2_subvolume_delete(struct btree_trans *, u32);
+
+static int check_subvol(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_subvolume subvol;
+ struct bch_snapshot snapshot;
+ unsigned snapid;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ subvol = bkey_s_c_to_subvolume(k);
+ snapid = le32_to_cpu(subvol.v->snapshot);
+ ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
+
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+ k.k->p.offset, snapid);
+ if (ret)
+ return ret;
+
+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+ bch2_fs_lazy_rw(c);
+
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ if (ret)
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
+ u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
+ u32 snapshot_tree;
+ struct bch_snapshot_tree st;
+
+ rcu_read_lock();
+ snapshot_tree = snapshot_t(c, snapshot_root)->tree;
+ rcu_read_unlock();
+
+ ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "%s: snapshot tree %u not found", __func__, snapshot_tree);
+
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
+ c, subvol_not_master_and_not_snapshot,
+ "subvolume %llu is not set as snapshot but is not master subvolume",
+ k.k->p.offset)) {
+ struct bkey_i_subvolume *s =
+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ SET_BCH_SUBVOLUME_SNAP(&s->v, true);
+ }
+ }
+
+fsck_err:
+ return ret;
+}
+
+int bch2_check_subvols(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_subvol(trans, &iter, k)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/* Subvolumes: */
+
+int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
+ bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
+ subvol_pos_bad,
+ "invalid pos");
+fsck_err:
+ return ret;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ prt_printf(out, "root %llu snapshot id %u",
+ le64_to_cpu(s.v->inode),
+ le32_to_cpu(s.v->snapshot));
+
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
+ prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
+}
+
+static __always_inline int
+bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
+ iter_flags, subvolume, s);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
+ inconsistent_if_not_found,
+ trans->c, "missing subvolume %u", subvol);
+ return ret;
+}
+
+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
+}
+
+int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
+{
+ struct bch_subvolume s;
+ int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s);
+ if (ret)
+ return ret;
+
+ if (BCH_SUBVOLUME_RO(&s))
+ return -EROFS;
+ return 0;
+}
+
+int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
+{
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_subvol_is_ro_trans(trans, subvol));
+}
+
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+ struct bch_subvolume *subvol)
+{
+ struct bch_snapshot snap;
+
+ return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+ u32 *snapid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_subvolume subvol;
+ int ret;
+
+ subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+ subvolume);
+ ret = bkey_err(subvol);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+
+ if (likely(!ret))
+ *snapid = le32_to_cpu(subvol.v->snapshot);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_subvolume_reparent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ u32 old_parent, u32 new_parent)
+{
+ struct bkey_i_subvolume *s;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
+ return 0;
+
+ s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ s->v.parent = cpu_to_le32(new_parent);
+ return 0;
+}
+
+/*
+ * Separate from the snapshot tree in the snapshots btree, we record the tree
+ * structure of how snapshot subvolumes were created - the parent subvolume of
+ * each snapshot subvolume.
+ *
+ * When a subvolume is deleted, we scan for child subvolumes and reparent them,
+ * to avoid dangling references:
+ */
+static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_subvolume s;
+
+ return lockrestart_do(trans,
+ bch2_subvolume_get(trans, subvolid_to_delete, true,
+ BTREE_ITER_CACHED, &s)) ?:
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_subvolume_reparent(trans, &iter, k,
+ subvolid_to_delete, le32_to_cpu(s.parent)));
+}
+
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_subvolume subvol;
+ u32 snapid;
+ int ret = 0;
+
+ subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT,
+ subvolume);
+ ret = bkey_err(subvol);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+ if (ret)
+ return ret;
+
+ snapid = le32_to_cpu(subvol.v->snapshot);
+
+ ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+ bch2_snapshot_node_set_deleted(trans, snapid);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+ return bch2_subvolumes_reparent(trans, subvolid) ?:
+ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ __bch2_subvolume_delete(trans, subvolid));
+}
+
+static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ snapshot_wait_for_pagecache_and_delete_work);
+ snapshot_id_list s;
+ u32 *id;
+ int ret = 0;
+
+ while (!ret) {
+ mutex_lock(&c->snapshots_unlinked_lock);
+ s = c->snapshots_unlinked;
+ darray_init(&c->snapshots_unlinked);
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (!s.nr)
+ break;
+
+ bch2_evict_subvolume_inodes(c, &s);
+
+ for (id = s.data; id < s.data + s.nr; id++) {
+ ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ break;
+ }
+ }
+
+ darray_exit(&s);
+ }
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+}
+
+struct subvolume_unlink_hook {
+ struct btree_trans_commit_hook h;
+ u32 subvol;
+};
+
+static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *_h)
+{
+ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ mutex_lock(&c->snapshots_unlinked_lock);
+ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (ret)
+ return ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
+ return -EROFS;
+
+ if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+ return 0;
+}
+
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_i_subvolume *n;
+ struct subvolume_unlink_hook *h;
+ int ret = 0;
+
+ h = bch2_trans_kmalloc(trans, sizeof(*h));
+ ret = PTR_ERR_OR_ZERO(h);
+ if (ret)
+ return ret;
+
+ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook;
+ h->subvol = subvolid;
+ bch2_trans_commit_hook(trans, &h->h);
+
+ n = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_CACHED, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+ return ret;
+ }
+
+ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 src_subvolid,
+ u32 *new_subvolid,
+ u32 *new_snapshotid,
+ bool ro)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct bkey_i_subvolume *new_subvol = NULL;
+ struct bkey_i_subvolume *src_subvol = NULL;
+ u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+ int ret = 0;
+
+ ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
+ BTREE_ID_subvolumes, POS(0, U32_MAX));
+ if (ret == -BCH_ERR_ENOSPC_btree_slot)
+ ret = -BCH_ERR_ENOSPC_subvolume_create;
+ if (ret)
+ return ret;
+
+ snapshot_subvols[0] = dst_iter.pos.offset;
+ snapshot_subvols[1] = src_subvolid;
+
+ if (src_subvolid) {
+ /* Creating a snapshot: */
+
+ src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
+ BTREE_ID_subvolumes, POS(0, src_subvolid),
+ BTREE_ITER_CACHED, subvolume);
+ ret = PTR_ERR_OR_ZERO(src_subvol);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "subvolume %u not found", src_subvolid);
+ goto err;
+ }
+
+ parent = le32_to_cpu(src_subvol->v.snapshot);
+ }
+
+ ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+ snapshot_subvols,
+ src_subvolid ? 2 : 1);
+ if (ret)
+ goto err;
+
+ if (src_subvolid) {
+ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+ if (ret)
+ goto err;
+ }
+
+ new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ goto err;
+
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ new_subvol->v.parent = cpu_to_le32(src_subvolid);
+ new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
+ new_subvol->v.otime.hi = 0;
+
+ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+
+ *new_subvolid = new_subvol->k.p.offset;
+ *new_snapshotid = new_nodes[0];
+err:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
+}
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+ bch2_subvolume_wait_for_pagecache_and_delete);
+ mutex_init(&c->snapshots_unlinked_lock);
+ return 0;
+}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
new file mode 100644
index 000000000000..a6f56f66e27c
--- /dev/null
+++ b/fs/bcachefs/subvolume.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+#include "darray.h"
+#include "subvolume_types.h"
+
+enum bkey_invalid_flags;
+
+int bch2_check_subvols(struct bch_fs *);
+
+int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
+ .key_invalid = bch2_subvolume_invalid, \
+ .val_to_text = bch2_subvolume_to_text, \
+ .min_val_size = 16, \
+})
+
+int bch2_subvolume_get(struct btree_trans *, unsigned,
+ bool, int, struct bch_subvolume *);
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
+int bch2_subvol_is_ro(struct bch_fs *, u32);
+
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+
+int bch2_subvolume_unlink(struct btree_trans *, u32);
+int bch2_subvolume_create(struct btree_trans *, u64, u32,
+ u32 *, u32 *, bool);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
new file mode 100644
index 000000000000..2d2e66a4e468
--- /dev/null
+++ b/fs/bcachefs/subvolume_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+#include "darray.h"
+
+typedef DARRAY(u32) snapshot_id_list;
+
+#define IS_ANCESTOR_BITMAP 128
+
+struct snapshot_t {
+ u32 parent;
+ u32 skip[3];
+ u32 depth;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 tree;
+ u32 equiv;
+ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
+};
+
+struct snapshot_table {
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
+};
+
+typedef struct {
+ u32 subvol;
+ u64 inum;
+} subvol_inum;
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
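
A brief usage sketch for snapshot_id_list, mirroring the darray and
snapshot_list_* calls made from subvolume.c above (c and id are assumed to be
a filesystem pointer and a snapshot ID; snapshot_list_add()/has_id() are
declared in snapshot.h):

	snapshot_id_list s;
	int ret = 0;

	darray_init(&s);

	if (!snapshot_list_has_id(&s, id))
		ret = snapshot_list_add(c, &s, id);	/* allocation may fail */

	/* ... walk s.data[0..s.nr) ... */

	darray_exit(&s);
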
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
new file mode 100644
index 000000000000..4c98d8cc2a79
--- /dev/null
+++ b/fs/bcachefs/super-io.c
@@ -0,0 +1,1353 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "checksum.h"
+#include "counters.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_sb.h"
+#include "journal_seq_blacklist.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "quota.h"
+#include "sb-clean.h"
+#include "sb-downgrade.h"
+#include "sb-errors.h"
+#include "sb-members.h"
+#include "super-io.h"
+#include "super.h"
+#include "trace.h"
+#include "vstructs.h"
+
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
+struct bch2_metadata_version {
+ u16 version;
+ const char *name;
+ u64 recovery_passes;
+};
+
+static const struct bch2_metadata_version bch2_metadata_versions[] = {
+#define x(n, v, _recovery_passes) { \
+ .version = v, \
+ .name = #n, \
+ .recovery_passes = _recovery_passes, \
+},
+ BCH_METADATA_VERSIONS()
+#undef x
+};
+
+void bch2_version_to_text(struct printbuf *out, unsigned v)
+{
+ const char *str = "(unknown version)";
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
+ if (bch2_metadata_versions[i].version == v) {
+ str = bch2_metadata_versions[i].name;
+ break;
+ }
+
+ prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
+}
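
A caller-side usage sketch (assuming a struct bch_sb *sb in scope): the
version printer is driven through a printbuf, the same pattern this file uses
later for error reporting:

	struct printbuf buf = PRINTBUF;

	bch2_version_to_text(&buf, le16_to_cpu(sb->version));
	pr_info("bcachefs: superblock version %s\n", buf.buf);
	printbuf_exit(&buf);
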
+
+unsigned bch2_latest_compatible_version(unsigned v)
+{
+ if (!BCH_VERSION_MAJOR(v))
+ return v;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
+ if (bch2_metadata_versions[i].version > v &&
+ BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) ==
+ BCH_VERSION_MAJOR(v))
+ v = bch2_metadata_versions[i].version;
+
+ return v;
+}
+
+u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
+ unsigned old_version,
+ unsigned new_version)
+{
+ u64 ret = 0;
+
+ for (const struct bch2_metadata_version *i = bch2_metadata_versions;
+ i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions);
+ i++)
+ if (i->version > old_version && i->version <= new_version) {
+ if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK)
+ ret |= bch2_fsck_recovery_passes();
+ ret |= i->recovery_passes;
+ }
+
+	return ret & ~RECOVERY_PASS_ALL_FSCK;
+}
+
+const char * const bch2_sb_fields[] = {
+#define x(name, nr) #name,
+ BCH_SB_FIELDS()
+#undef x
+ NULL
+};
+
+static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
+ struct printbuf *);
+
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
+ enum bch_sb_field_type type)
+{
+ struct bch_sb_field *f;
+
+ /* XXX: need locking around superblock to access optional fields */
+
+ vstruct_for_each(sb, f)
+ if (le32_to_cpu(f->type) == type)
+ return f;
+ return NULL;
+}
+
+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
+ struct bch_sb_field *f,
+ unsigned u64s)
+{
+ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
+
+ BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
+
+ if (!f && !u64s) {
+ /* nothing to do: */
+ } else if (!f) {
+ f = vstruct_last(sb->sb);
+ memset(f, 0, sizeof(u64) * u64s);
+ f->u64s = cpu_to_le32(u64s);
+ f->type = 0;
+ } else {
+ void *src, *dst;
+
+ src = vstruct_end(f);
+
+ if (u64s) {
+ f->u64s = cpu_to_le32(u64s);
+ dst = vstruct_end(f);
+ } else {
+ dst = f;
+ }
+
+ memmove(dst, src, vstruct_end(sb->sb) - src);
+
+ if (dst > src)
+ memset(src, 0, dst - src);
+ }
+
+ sb->sb->u64s = cpu_to_le32(sb_u64s);
+
+ return u64s ? f : NULL;
+}
+
+void bch2_sb_field_delete(struct bch_sb_handle *sb,
+ enum bch_sb_field_type type)
+{
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+
+ if (f)
+ __bch2_sb_field_resize(sb, f, 0);
+}
+
+/* Superblock realloc/free: */
+
+void bch2_free_super(struct bch_sb_handle *sb)
+{
+ kfree(sb->bio);
+ if (!IS_ERR_OR_NULL(sb->bdev))
+ blkdev_put(sb->bdev, sb->holder);
+ kfree(sb->holder);
+ kfree(sb->sb_name);
+
+ kfree(sb->sb);
+ memset(sb, 0, sizeof(*sb));
+}
+
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
+{
+ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+ size_t new_buffer_size;
+ struct bch_sb *new_sb;
+ struct bio *bio;
+
+ if (sb->bdev)
+ new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
+
+ new_buffer_size = roundup_pow_of_two(new_bytes);
+
+ if (sb->sb && sb->buffer_size >= new_buffer_size)
+ return 0;
+
+ if (sb->sb && sb->have_layout) {
+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
+
+ if (new_bytes > max_bytes) {
+ pr_err("%pg: superblock too big: want %zu but have %llu",
+ sb->bdev, new_bytes, max_bytes);
+ return -BCH_ERR_ENOSPC_sb;
+ }
+ }
+
+ if (sb->buffer_size >= new_buffer_size && sb->sb)
+ return 0;
+
+ if (dynamic_fault("bcachefs:add:super_realloc"))
+ return -BCH_ERR_ENOMEM_sb_realloc_injected;
+
+ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
+ if (!new_sb)
+ return -BCH_ERR_ENOMEM_sb_buf_realloc;
+
+ sb->sb = new_sb;
+
+ if (sb->have_bio) {
+ unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
+
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ if (!bio)
+ return -BCH_ERR_ENOMEM_sb_bio_realloc;
+
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+
+ kfree(sb->bio);
+ sb->bio = bio;
+ }
+
+ sb->buffer_size = new_buffer_size;
+
+ return 0;
+}
+
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
+ enum bch_sb_field_type type,
+ unsigned u64s)
+{
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+ ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ ssize_t d = -old_u64s + u64s;
+
+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+ return NULL;
+
+ if (sb->fs_sb) {
+ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
+ struct bch_dev *ca;
+ unsigned i;
+
+ lockdep_assert_held(&c->sb_lock);
+
+		/* XXX: we're not checking that offline devices have enough space */
+
+ for_each_online_member(ca, c, i) {
+ struct bch_sb_handle *dev_sb = &ca->disk_sb;
+
+ if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
+ percpu_ref_put(&ca->ref);
+ return NULL;
+ }
+ }
+ }
+
+ f = bch2_sb_field_get_id(sb->sb, type);
+ f = __bch2_sb_field_resize(sb, f, u64s);
+ if (f)
+ f->type = cpu_to_le32(type);
+ return f;
+}
+
+struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb,
+ enum bch_sb_field_type type,
+ unsigned u64s)
+{
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+
+ if (!f || le32_to_cpu(f->u64s) < u64s)
+ f = bch2_sb_field_resize_id(sb, type, u64s);
+ return f;
+}
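
Callers normally go through the typed wrapper declared in super-io.h below; a
sketch, with the section name and size purely illustrative and c->sb_lock
assumed held:

	struct bch_sb_field_ext *ext =
		bch2_sb_field_get_minsize(&c->disk_sb, ext, 16);
	if (!ext)
		return -BCH_ERR_ENOSPC_sb;
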
+
+/* Superblock validate: */
+
+static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
+{
+ u64 offset, prev_offset, max_sectors;
+ unsigned i;
+
+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+
+ if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
+ !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
+ prt_printf(out, "Not a bcachefs superblock layout");
+ return -BCH_ERR_invalid_sb_layout;
+ }
+
+ if (layout->layout_type != 0) {
+ prt_printf(out, "Invalid superblock layout type %u",
+ layout->layout_type);
+ return -BCH_ERR_invalid_sb_layout_type;
+ }
+
+ if (!layout->nr_superblocks) {
+ prt_printf(out, "Invalid superblock layout: no superblocks");
+ return -BCH_ERR_invalid_sb_layout_nr_superblocks;
+ }
+
+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
+ prt_printf(out, "Invalid superblock layout: too many superblocks");
+ return -BCH_ERR_invalid_sb_layout_nr_superblocks;
+ }
+
+ max_sectors = 1 << layout->sb_max_size_bits;
+
+ prev_offset = le64_to_cpu(layout->sb_offset[0]);
+
+ for (i = 1; i < layout->nr_superblocks; i++) {
+ offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset < prev_offset + max_sectors) {
+ prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
+ " (sb %u ends at %llu next starts at %llu",
+				   "  (sb %u ends at %llu, next starts at %llu)",
+ return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
+ }
+ prev_offset = offset;
+ }
+
+ return 0;
+}
+
+static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
+{
+ u16 version = le16_to_cpu(sb->version);
+ u16 version_min = le16_to_cpu(sb->version_min);
+
+ if (!bch2_version_compatible(version)) {
+ prt_str(out, "Unsupported superblock version ");
+ bch2_version_to_text(out, version);
+ prt_str(out, " (min ");
+ bch2_version_to_text(out, bcachefs_metadata_version_min);
+ prt_str(out, ", max ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ prt_str(out, ")");
+ return -BCH_ERR_invalid_sb_version;
+ }
+
+ if (!bch2_version_compatible(version_min)) {
+ prt_str(out, "Unsupported superblock version_min ");
+ bch2_version_to_text(out, version_min);
+ prt_str(out, " (min ");
+ bch2_version_to_text(out, bcachefs_metadata_version_min);
+ prt_str(out, ", max ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ prt_str(out, ")");
+ return -BCH_ERR_invalid_sb_version;
+ }
+
+ if (version_min > version) {
+ prt_str(out, "Bad minimum version ");
+ bch2_version_to_text(out, version_min);
+ prt_str(out, ", greater than version field ");
+ bch2_version_to_text(out, version);
+ return -BCH_ERR_invalid_sb_version;
+ }
+
+ return 0;
+}
+
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
+ int rw)
+{
+ struct bch_sb *sb = disk_sb->sb;
+ struct bch_sb_field *f;
+ struct bch_sb_field_members_v1 *mi;
+ enum bch_opt_id opt_id;
+ u16 block_size;
+ int ret;
+
+ ret = bch2_sb_compatible(sb, out);
+ if (ret)
+ return ret;
+
+ if (sb->features[1] ||
+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
+ prt_printf(out, "Filesystem has incompatible features");
+ return -BCH_ERR_invalid_sb_features;
+ }
+
+ block_size = le16_to_cpu(sb->block_size);
+
+ if (block_size > PAGE_SECTORS) {
+ prt_printf(out, "Block size too big (got %u, max %u)",
+ block_size, PAGE_SECTORS);
+ return -BCH_ERR_invalid_sb_block_size;
+ }
+
+ if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
+ prt_printf(out, "Bad user UUID (got zeroes)");
+ return -BCH_ERR_invalid_sb_uuid;
+ }
+
+ if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
+ prt_printf(out, "Bad internal UUID (got zeroes)");
+ return -BCH_ERR_invalid_sb_uuid;
+ }
+
+ if (!sb->nr_devices ||
+ sb->nr_devices > BCH_SB_MEMBERS_MAX) {
+ prt_printf(out, "Bad number of member devices %u (max %u)",
+ sb->nr_devices, BCH_SB_MEMBERS_MAX);
+ return -BCH_ERR_invalid_sb_too_many_members;
+ }
+
+ if (sb->dev_idx >= sb->nr_devices) {
+ prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
+ sb->dev_idx, sb->nr_devices);
+ return -BCH_ERR_invalid_sb_dev_idx;
+ }
+
+ if (!sb->time_precision ||
+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
+ prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
+ le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
+ return -BCH_ERR_invalid_sb_time_precision;
+ }
+
+ if (rw == READ) {
+ /*
+ * Been seeing a bug where these are getting inexplicably
+ * zeroed, so we're now validating them, but we have to be
+		 * careful not to prevent people's filesystems from mounting:
+ */
+ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
+
+ if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
+ }
+
+ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
+ const struct bch_option *opt = bch2_opt_table + opt_id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, opt_id);
+
+ prt_printf(out, "Invalid option ");
+ ret = bch2_opt_validate(opt, v, out);
+ if (ret)
+ return ret;
+
+ printbuf_reset(out);
+ }
+ }
+
+ /* validate layout */
+ ret = validate_sb_layout(&sb->layout, out);
+ if (ret)
+ return ret;
+
+ vstruct_for_each(sb, f) {
+ if (!f->u64s) {
+ prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
+ le32_to_cpu(f->type));
+ return -BCH_ERR_invalid_sb_field_size;
+ }
+
+ if (vstruct_next(f) > vstruct_last(sb)) {
+ prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+ le32_to_cpu(f->type));
+ return -BCH_ERR_invalid_sb_field_size;
+ }
+ }
+
+ /* members must be validated first: */
+ mi = bch2_sb_field_get(sb, members_v1);
+ if (!mi) {
+ prt_printf(out, "Invalid superblock: member info area missing");
+ return -BCH_ERR_invalid_sb_members_missing;
+ }
+
+ ret = bch2_sb_field_validate(sb, &mi->field, out);
+ if (ret)
+ return ret;
+
+ vstruct_for_each(sb, f) {
+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
+ continue;
+
+ ret = bch2_sb_field_validate(sb, f, out);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* device open: */
+
+static unsigned long le_ulong_to_cpu(unsigned long v)
+{
+ return sizeof(unsigned long) == 8
+ ? le64_to_cpu(v)
+ : le32_to_cpu(v);
+}
+
+static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr)
+{
+ BUG_ON(nr & (BITS_PER_TYPE(long) - 1));
+
+ for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++)
+ dst[i] = le_ulong_to_cpu(src[i]);
+}
+
+static void bch2_sb_update(struct bch_fs *c)
+{
+ struct bch_sb *src = c->disk_sb.sb;
+ struct bch_dev *ca;
+ unsigned i;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ c->sb.uuid = src->uuid;
+ c->sb.user_uuid = src->user_uuid;
+ c->sb.version = le16_to_cpu(src->version);
+ c->sb.version_min = le16_to_cpu(src->version_min);
+ c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
+ c->sb.nr_devices = src->nr_devices;
+ c->sb.clean = BCH_SB_CLEAN(src);
+ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
+
+ c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
+ c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
+
+ /* XXX this is wrong, we need a 96 or 128 bit integer type */
+ c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
+ c->sb.nsec_per_time_unit);
+ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
+
+ c->sb.features = le64_to_cpu(src->features[0]);
+ c->sb.compat = le64_to_cpu(src->compat[0]);
+
+ memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
+ if (ext)
+ le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
+ sizeof(c->sb.errors_silent) * 8);
+
+ for_each_member_device(ca, c, i) {
+ struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
+ ca->mi = bch2_mi_to_cpu(&m);
+ }
+}
+
+static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
+{
+ struct bch_sb_field *src_f, *dst_f;
+ struct bch_sb *dst = dst_handle->sb;
+ unsigned i;
+
+ dst->version = src->version;
+ dst->version_min = src->version_min;
+ dst->seq = src->seq;
+ dst->uuid = src->uuid;
+ dst->user_uuid = src->user_uuid;
+ memcpy(dst->label, src->label, sizeof(dst->label));
+
+ dst->block_size = src->block_size;
+ dst->nr_devices = src->nr_devices;
+
+ dst->time_base_lo = src->time_base_lo;
+ dst->time_base_hi = src->time_base_hi;
+ dst->time_precision = src->time_precision;
+
+ memcpy(dst->flags, src->flags, sizeof(dst->flags));
+ memcpy(dst->features, src->features, sizeof(dst->features));
+ memcpy(dst->compat, src->compat, sizeof(dst->compat));
+
+ for (i = 0; i < BCH_SB_FIELD_NR; i++) {
+ int d;
+
+ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
+ continue;
+
+ src_f = bch2_sb_field_get_id(src, i);
+ dst_f = bch2_sb_field_get_id(dst, i);
+
+ d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
+ (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
+ if (d > 0) {
+ int ret = bch2_sb_realloc(dst_handle,
+ le32_to_cpu(dst_handle->sb->u64s) + d);
+
+ if (ret)
+ return ret;
+
+ dst = dst_handle->sb;
+ dst_f = bch2_sb_field_get_id(dst, i);
+ }
+
+ dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
+ src_f ? le32_to_cpu(src_f->u64s) : 0);
+
+ if (src_f)
+ memcpy(dst_f, src_f, vstruct_bytes(src_f));
+ }
+
+ return 0;
+}
+
+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
+{
+ int ret;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ ret = bch2_sb_realloc(&c->disk_sb, 0) ?:
+ __copy_super(&c->disk_sb, src) ?:
+ bch2_sb_replicas_to_cpu_replicas(c) ?:
+ bch2_sb_disk_groups_to_cpu(c);
+ if (ret)
+ return ret;
+
+ bch2_sb_update(c);
+ return 0;
+}
+
+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
+{
+ return __copy_super(&ca->disk_sb, c->disk_sb.sb);
+}
+
+/* read superblock: */
+
+static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
+{
+ struct bch_csum csum;
+ size_t bytes;
+ int ret;
+reread:
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+ sb->bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
+
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ prt_printf(err, "IO error: %i", ret);
+ return ret;
+ }
+
+ if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
+ !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
+ prt_printf(err, "Not a bcachefs superblock");
+ return -BCH_ERR_invalid_sb_magic;
+ }
+
+ ret = bch2_sb_compatible(sb->sb, err);
+ if (ret)
+ return ret;
+
+ bytes = vstruct_bytes(sb->sb);
+
+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+ bytes, 512UL << sb->sb->layout.sb_max_size_bits);
+ return -BCH_ERR_invalid_sb_too_big;
+ }
+
+ if (bytes > sb->buffer_size) {
+ ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
+ if (ret)
+ return ret;
+ goto reread;
+ }
+
+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+ prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+ return -BCH_ERR_invalid_sb_csum_type;
+ }
+
+ /* XXX: verify MACs */
+ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
+ null_nonce(), sb->sb);
+
+ if (bch2_crc_cmp(csum, sb->sb->csum)) {
+ prt_printf(err, "bad checksum");
+ return -BCH_ERR_invalid_sb_csum;
+ }
+
+ sb->seq = le64_to_cpu(sb->sb->seq);
+
+ return 0;
+}
+
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ u64 offset = opt_get(*opts, sb);
+ struct bch_sb_layout layout;
+ struct printbuf err = PRINTBUF;
+ __le64 *i;
+ int ret;
+#ifndef __KERNEL__
+retry:
+#endif
+ memset(sb, 0, sizeof(*sb));
+ sb->mode = BLK_OPEN_READ;
+ sb->have_bio = true;
+ sb->holder = kmalloc(1, GFP_KERNEL);
+ if (!sb->holder)
+ return -ENOMEM;
+
+ sb->sb_name = kstrdup(path, GFP_KERNEL);
+ if (!sb->sb_name)
+ return -ENOMEM;
+
+#ifndef __KERNEL__
+ if (opt_get(*opts, direct_io) == false)
+ sb->mode |= BLK_OPEN_BUFFERED;
+#endif
+
+ if (!opt_get(*opts, noexcl))
+ sb->mode |= BLK_OPEN_EXCL;
+
+ if (!opt_get(*opts, nochanges))
+ sb->mode |= BLK_OPEN_WRITE;
+
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->bdev) &&
+ PTR_ERR(sb->bdev) == -EACCES &&
+ opt_get(*opts, read_only)) {
+ sb->mode &= ~BLK_OPEN_WRITE;
+
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->bdev))
+ opt_set(*opts, nochanges, true);
+ }
+
+ if (IS_ERR(sb->bdev)) {
+ ret = PTR_ERR(sb->bdev);
+ goto out;
+ }
+
+ ret = bch2_sb_realloc(sb, 0);
+ if (ret) {
+ prt_printf(&err, "error allocating memory for superblock");
+ goto err;
+ }
+
+ if (bch2_fs_init_fault("read_super")) {
+ prt_printf(&err, "dynamic fault");
+ ret = -EFAULT;
+ goto err;
+ }
+
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
+ goto got_super;
+
+ if (opt_defined(*opts, sb))
+ goto err;
+
+ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
+ path, err.buf);
+ printbuf_reset(&err);
+
+ /*
+ * Error reading primary superblock - read location of backup
+ * superblocks:
+ */
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+ /*
+ * use sb buffer to read layout, since sb buffer is page aligned but
+ * layout won't be:
+ */
+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
+
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ prt_printf(&err, "IO error: %i", ret);
+ goto err;
+ }
+
+ memcpy(&layout, sb->sb, sizeof(layout));
+ ret = validate_sb_layout(&layout, &err);
+ if (ret)
+ goto err;
+
+ for (i = layout.sb_offset;
+ i < layout.sb_offset + layout.nr_superblocks; i++) {
+ offset = le64_to_cpu(*i);
+
+ if (offset == opt_get(*opts, sb))
+ continue;
+
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
+ goto got_super;
+ }
+
+ goto err;
+
+got_super:
+ if (le16_to_cpu(sb->sb->block_size) << 9 <
+ bdev_logical_block_size(sb->bdev) &&
+ opt_get(*opts, direct_io)) {
+#ifndef __KERNEL__
+ opt_set(*opts, direct_io, false);
+ bch2_free_super(sb);
+ goto retry;
+#endif
+ prt_printf(&err, "block size (%u) smaller than device block size (%u)",
+ le16_to_cpu(sb->sb->block_size) << 9,
+ bdev_logical_block_size(sb->bdev));
+ ret = -BCH_ERR_block_size_too_small;
+ goto err;
+ }
+
+ ret = 0;
+ sb->have_layout = true;
+
+ ret = bch2_sb_validate(sb, &err, READ);
+ if (ret) {
+ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
+ path, err.buf);
+ goto err_no_print;
+ }
+out:
+ printbuf_exit(&err);
+ return ret;
+err:
+ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
+ path, err.buf);
+err_no_print:
+ bch2_free_super(sb);
+ goto out;
+}
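
From the caller's side, the read path looks roughly like this (a sketch: the
device path is illustrative, and bch2_opts_empty() is assumed to be the usual
empty-options helper from opts.h):

	struct bch_sb_handle sb;
	struct bch_opts opts = bch2_opts_empty();
	int ret = bch2_read_super("/dev/sdb", &opts, &sb);

	if (ret)
		return ret;

	/* sb.sb is now validated and ready to hand to bch2_sb_to_fs() */
	bch2_free_super(&sb);
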
+
+/* write superblock: */
+
+static void write_super_endio(struct bio *bio)
+{
+ struct bch_dev *ca = bio->bi_private;
+
+ /* XXX: return errors directly */
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "superblock %s error: %s",
+ bio_data_dir(bio) ? "write" : "read",
+ bch2_blk_status_to_str(bio->bi_status)))
+ ca->sb_write_error = 1;
+
+ closure_put(&ca->fs->sb_write);
+ percpu_ref_put(&ca->io_ref);
+}
+
+static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bch_sb *sb = ca->disk_sb.sb;
+ struct bio *bio = ca->disk_sb.bio;
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
+ bio_sectors(bio));
+
+ percpu_ref_get(&ca->io_ref);
+ closure_bio_submit(bio, &c->sb_write);
+}
+
+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
+{
+ struct bch_sb *sb = ca->disk_sb.sb;
+ struct bio *bio = ca->disk_sb.bio;
+
+ sb->offset = sb->layout.sb_offset[idx];
+
+ SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
+ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
+ null_nonce(), sb);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+ bch2_bio_map(bio, sb,
+ roundup((size_t) vstruct_bytes(sb),
+ bdev_logical_block_size(ca->disk_sb.bdev)));
+
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
+ bio_sectors(bio));
+
+ percpu_ref_get(&ca->io_ref);
+ closure_bio_submit(bio, &c->sb_write);
+}
+
+int bch2_write_super(struct bch_fs *c)
+{
+ struct closure *cl = &c->sb_write;
+ struct bch_dev *ca;
+ struct printbuf err = PRINTBUF;
+ unsigned i, sb = 0, nr_wrote;
+ struct bch_devs_mask sb_written;
+ bool wrote, can_mount_without_written, can_mount_with_written;
+ unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+ int ret = 0;
+
+ trace_and_count(c, write_super, c, _RET_IP_);
+
+ if (c->opts.very_degraded)
+ degraded_flags |= BCH_FORCE_IF_LOST;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ closure_init_stack(cl);
+ memset(&sb_written, 0, sizeof(sb_written));
+
+ /* Make sure we're using the new magic numbers: */
+ c->disk_sb.sb->magic = BCHFS_MAGIC;
+ c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
+
+ le64_add_cpu(&c->disk_sb.sb->seq, 1);
+
+ if (test_bit(BCH_FS_ERROR, &c->flags))
+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+ if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
+
+ SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+
+ bch2_sb_counters_from_cpu(c);
+ bch2_sb_members_from_cpu(c);
+ bch2_sb_members_cpy_v2_v1(&c->disk_sb);
+ bch2_sb_errors_from_cpu(c);
+ bch2_sb_downgrade_update(c);
+
+ for_each_online_member(ca, c, i)
+ bch2_sb_from_fs(c, ca);
+
+ for_each_online_member(ca, c, i) {
+ printbuf_reset(&err);
+
+ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
+ if (ret) {
+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
+ percpu_ref_put(&ca->io_ref);
+ goto out;
+ }
+ }
+
+ if (c->opts.nochanges)
+ goto out;
+
+ /*
+ * Defer writing the superblock until filesystem initialization is
+ * complete - don't write out a partly initialized superblock:
+ */
+ if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
+ goto out;
+
+ for_each_online_member(ca, c, i) {
+ __set_bit(ca->dev_idx, sb_written.d);
+ ca->sb_write_error = 0;
+ }
+
+ for_each_online_member(ca, c, i)
+ read_back_super(c, ca);
+ closure_sync(cl);
+
+ for_each_online_member(ca, c, i) {
+ if (ca->sb_write_error)
+ continue;
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
+ bch2_fs_fatal_error(c,
+ "Superblock write was silently dropped! (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ percpu_ref_put(&ca->io_ref);
+ ret = -BCH_ERR_erofs_sb_err;
+ goto out;
+ }
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
+ bch2_fs_fatal_error(c,
+ "Superblock modified by another process (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ percpu_ref_put(&ca->io_ref);
+ ret = -BCH_ERR_erofs_sb_err;
+ goto out;
+ }
+ }
+
+ do {
+ wrote = false;
+ for_each_online_member(ca, c, i)
+ if (!ca->sb_write_error &&
+ sb < ca->disk_sb.sb->layout.nr_superblocks) {
+ write_one_super(c, ca, sb);
+ wrote = true;
+ }
+ closure_sync(cl);
+ sb++;
+ } while (wrote);
+
+ for_each_online_member(ca, c, i) {
+ if (ca->sb_write_error)
+ __clear_bit(ca->dev_idx, sb_written.d);
+ else
+ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
+ }
+
+ nr_wrote = dev_mask_nr(&sb_written);
+
+ can_mount_with_written =
+ bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+
+ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+ sb_written.d[i] = ~sb_written.d[i];
+
+ can_mount_without_written =
+ bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+
+ /*
+ * If we would be able to mount _without_ the devices we successfully
+ * wrote superblocks to, we weren't able to write to enough devices:
+ *
+	 * Exception: if we could mount without the successful writes only
+	 * because we haven't written anything yet (new filesystem), continue
+	 * as long as we'd be able to mount with the devices we did
+	 * successfully write to:
+ */
+ if (bch2_fs_fatal_err_on(!nr_wrote ||
+ !can_mount_with_written ||
+ (can_mount_without_written &&
+ !can_mount_with_written), c,
+ "Unable to write superblock to sufficient devices (from %ps)",
+ (void *) _RET_IP_))
+ ret = -1;
+out:
+ /* Make new options visible after they're persistent: */
+ bch2_sb_update(c);
+ printbuf_exit(&err);
+ return ret;
+}
+
+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
+{
+ mutex_lock(&c->sb_lock);
+ if (!(c->sb.features & (1ULL << feat))) {
+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
+
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+/* Downgrade if superblock is at a higher version than currently supported: */
+bool bch2_check_version_downgrade(struct bch_fs *c)
+{
+ bool ret = bcachefs_metadata_version_current < c->sb.version;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ /*
+ * Downgrade, if superblock is at a higher version than currently
+ * supported:
+ */
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
+ if (c->sb.version > bcachefs_metadata_version_current)
+ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+ if (c->sb.version_min > bcachefs_metadata_version_current)
+ c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
+ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
+ return ret;
+}
+
+void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ if (BCH_VERSION_MAJOR(new_version) >
+ BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
+ bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
+
+ c->disk_sb.sb->version = cpu_to_le16(new_version);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+}
+
+static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ if (vstruct_bytes(f) < 88) {
+ prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
+ return -BCH_ERR_invalid_sb_ext;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_ext *e = field_to_type(f, ext);
+
+ prt_printf(out, "Recovery passes required:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
+ prt_newline(out);
+
+ unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL);
+ if (errors_silent) {
+ le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
+
+ prt_printf(out, "Errors to silently fix:");
+ prt_tab(out);
+ prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8);
+ prt_newline(out);
+
+ kfree(errors_silent);
+ }
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
+ .validate = bch2_sb_ext_validate,
+ .to_text = bch2_sb_ext_to_text,
+};
+
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr) \
+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+ BCH_SB_FIELDS()
+#undef x
+};
+
+static const struct bch_sb_field_ops bch2_sb_field_null_ops;
+
+static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
+{
+ return likely(type < ARRAY_SIZE(bch2_sb_field_ops))
+ ? bch2_sb_field_ops[type]
+ : &bch2_sb_field_null_ops;
+}
+
+static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ unsigned type = le32_to_cpu(f->type);
+ struct printbuf field_err = PRINTBUF;
+ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
+ int ret;
+
+ ret = ops->validate ? ops->validate(sb, f, &field_err) : 0;
+ if (ret) {
+ prt_printf(err, "Invalid superblock section %s: %s",
+ bch2_sb_fields[type], field_err.buf);
+ prt_newline(err);
+ bch2_sb_field_to_text(err, sb, f);
+ }
+
+ printbuf_exit(&field_err);
+ return ret;
+}
+
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ unsigned type = le32_to_cpu(f->type);
+ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ if (type < BCH_SB_FIELD_NR)
+ prt_printf(out, "%s", bch2_sb_fields[type]);
+ else
+ prt_printf(out, "(unknown field %u)", type);
+
+ prt_printf(out, " (size %zu):", vstruct_bytes(f));
+ prt_newline(out);
+
+ if (ops->to_text) {
+ printbuf_indent_add(out, 2);
+ ops->to_text(out, sb, f);
+ printbuf_indent_sub(out, 2);
+ }
+}
+
+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
+{
+ unsigned i;
+
+ prt_printf(out, "Type: %u", l->layout_type);
+ prt_newline(out);
+
+ prt_str(out, "Superblock max size: ");
+ prt_units_u64(out, 512 << l->sb_max_size_bits);
+ prt_newline(out);
+
+ prt_printf(out, "Nr superblocks: %u", l->nr_superblocks);
+ prt_newline(out);
+
+ prt_str(out, "Offsets: ");
+ for (i = 0; i < l->nr_superblocks; i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
+ }
+ prt_newline(out);
+}
+
+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
+ bool print_layout, unsigned fields)
+{
+ struct bch_sb_field *f;
+ u64 fields_have = 0;
+ unsigned nr_devices = 0;
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 44);
+
+ for (int i = 0; i < sb->nr_devices; i++)
+ nr_devices += bch2_dev_exists(sb, i);
+
+ prt_printf(out, "External UUID:");
+ prt_tab(out);
+ pr_uuid(out, sb->user_uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Internal UUID:");
+ prt_tab(out);
+ pr_uuid(out, sb->uuid.b);
+ prt_newline(out);
+
+ prt_str(out, "Device index:");
+ prt_tab(out);
+ prt_printf(out, "%u", sb->dev_idx);
+ prt_newline(out);
+
+ prt_str(out, "Label:");
+ prt_tab(out);
+ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+ prt_newline(out);
+
+ prt_str(out, "Version:");
+ prt_tab(out);
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_newline(out);
+
+ prt_str(out, "Version upgrade complete:");
+ prt_tab(out);
+ bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Oldest version on disk:");
+ prt_tab(out);
+ bch2_version_to_text(out, le16_to_cpu(sb->version_min));
+ prt_newline(out);
+
+ prt_printf(out, "Created:");
+ prt_tab(out);
+ if (sb->time_base_lo)
+ bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ else
+ prt_printf(out, "(not set)");
+ prt_newline(out);
+
+ prt_printf(out, "Sequence number:");
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(sb->seq));
+ prt_newline(out);
+
+ prt_printf(out, "Superblock size:");
+ prt_tab(out);
+ prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Clean:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Devices:");
+ prt_tab(out);
+ prt_printf(out, "%u", nr_devices);
+ prt_newline(out);
+
+ prt_printf(out, "Sections:");
+ vstruct_for_each(sb, f)
+ fields_have |= 1 << le32_to_cpu(f->type);
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_fields, fields_have);
+ prt_newline(out);
+
+ prt_printf(out, "Features:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
+ prt_newline(out);
+
+ prt_printf(out, "Compat features:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "Options:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ {
+ enum bch_opt_id id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, id);
+
+ prt_printf(out, "%s:", opt->attr.name);
+ prt_tab(out);
+ bch2_opt_to_text(out, NULL, sb, opt, v,
+ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
+ prt_newline(out);
+ }
+ }
+ }
+
+ printbuf_indent_sub(out, 2);
+
+ if (print_layout) {
+ prt_newline(out);
+ prt_printf(out, "layout:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ bch2_sb_layout_to_text(out, &sb->layout);
+ printbuf_indent_sub(out, 2);
+ }
+
+ vstruct_for_each(sb, f)
+ if (fields & (1 << le32_to_cpu(f->type))) {
+ prt_newline(out);
+ bch2_sb_field_to_text(out, sb, f);
+ }
+}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
new file mode 100644
index 000000000000..e41e5de531a0
--- /dev/null
+++ b/fs/bcachefs/super-io.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_IO_H
+#define _BCACHEFS_SUPER_IO_H
+
+#include "extents.h"
+#include "eytzinger.h"
+#include "super_types.h"
+#include "super.h"
+#include "sb-members.h"
+
+#include <asm/byteorder.h>
+
+static inline bool bch2_version_compatible(u16 version)
+{
+ return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
+ version >= bcachefs_metadata_version_min;
+}
+
+void bch2_version_to_text(struct printbuf *, unsigned);
+unsigned bch2_latest_compatible_version(unsigned);
+
+u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
+ unsigned,
+ unsigned);
+
+static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
+{
+ return le32_to_cpu(f->u64s) * sizeof(u64);
+}
+
+#define field_to_type(_f, _name) \
+ container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
+#define bch2_sb_field_get(_sb, _name) \
+ field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
+
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
+ enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_resize(_sb, _name, _u64s) \
+ field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
+
+struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *,
+ enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \
+ field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
+
+#define bch2_sb_field_nr_entries(_f) \
+ (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \
+ sizeof(_f->entries[0])) \
+ : 0)
+
+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
+
+extern const char * const bch2_sb_fields[];
+
+struct bch_sb_field_ops {
+ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *);
+ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
+};
+
+static inline __le64 bch2_sb_magic(struct bch_fs *c)
+{
+ __le64 ret;
+
+ memcpy(&ret, &c->sb.uuid, sizeof(ret));
+ return ret;
+}
+
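+/*
+ * Journal and btree node magic numbers are derived from the filesystem UUID,
+ * so metadata belonging to one filesystem can't be mistaken for another's:
+ */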
+static inline __u64 jset_magic(struct bch_fs *c)
+{
+ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
+}
+
+static inline __u64 bset_magic(struct bch_fs *c)
+{
+ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
+}
+
+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
+
+void bch2_free_super(struct bch_sb_handle *);
+int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
+
+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_write_super(struct bch_fs *);
+void __bch2_check_set_feature(struct bch_fs *, unsigned);
+
+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
+{
+ if (!(c->sb.features & (1ULL << feat)))
+ __bch2_check_set_feature(c, feat);
+}
+
+bool bch2_check_version_downgrade(struct bch_fs *);
+void bch2_sb_upgrade(struct bch_fs *, unsigned);
+
+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
+ struct bch_sb_field *);
+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
+
+#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
new file mode 100644
index 000000000000..818ec467a06b
--- /dev/null
+++ b/fs/bcachefs/super.c
@@ -0,0 +1,2030 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_sort.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_write_buffer.h"
+#include "buckets_waiting_for_journal.h"
+#include "chardev.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "counters.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io_read.h"
+#include "io_write.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "move.h"
+#include "migrate.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "quota.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "sb-errors.h"
+#include "sb-members.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+#include "sysfs.h"
+#include "trace.h"
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+#include <linux/sysfs.h>
+#include <crypto/hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_DESCRIPTION("bcachefs filesystem");
+MODULE_SOFTDEP("pre: crc32c");
+MODULE_SOFTDEP("pre: crc64");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: chacha20");
+MODULE_SOFTDEP("pre: poly1305");
+MODULE_SOFTDEP("pre: xxhash");
+
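+/*
+ * KTYPE() stamps out the sysfs boilerplate for each kobject type: an
+ * attribute group, a NULL-terminated list of groups, and the kobj_type
+ * itself, all named after the type:
+ */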
+#define KTYPE(type) \
+static const struct attribute_group type ## _group = { \
+ .attrs = type ## _files \
+}; \
+ \
+static const struct attribute_group *type ## _groups[] = { \
+ &type ## _group, \
+ NULL \
+}; \
+ \
+static const struct kobj_type type ## _ktype = { \
+ .release = type ## _release, \
+ .sysfs_ops = &type ## _sysfs_ops, \
+ .default_groups = type ## _groups \
+}
+
+static void bch2_fs_release(struct kobject *);
+static void bch2_dev_release(struct kobject *);
+static void bch2_fs_counters_release(struct kobject *k)
+{
+}
+
+static void bch2_fs_internal_release(struct kobject *k)
+{
+}
+
+static void bch2_fs_opts_dir_release(struct kobject *k)
+{
+}
+
+static void bch2_fs_time_stats_release(struct kobject *k)
+{
+}
+
+KTYPE(bch2_fs);
+KTYPE(bch2_fs_counters);
+KTYPE(bch2_fs_internal);
+KTYPE(bch2_fs_opts_dir);
+KTYPE(bch2_fs_time_stats);
+KTYPE(bch2_dev);
+
+static struct kset *bcachefs_kset;
+static LIST_HEAD(bch_fs_list);
+static DEFINE_MUTEX(bch_fs_list_lock);
+
+DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
+
+static void bch2_dev_free(struct bch_dev *);
+static int bch2_dev_alloc(struct bch_fs *, unsigned);
+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
+
+struct bch_fs *bch2_dev_to_fs(dev_t dev)
+{
+ struct bch_fs *c;
+ struct bch_dev *ca;
+ unsigned i;
+
+ mutex_lock(&bch_fs_list_lock);
+ rcu_read_lock();
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ for_each_member_device_rcu(ca, c, i, NULL)
+ if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
+ closure_get(&c->cl);
+ goto found;
+ }
+ c = NULL;
+found:
+ rcu_read_unlock();
+ mutex_unlock(&bch_fs_list_lock);
+
+ return c;
+}
+
+static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
+{
+ struct bch_fs *c;
+
+ lockdep_assert_held(&bch_fs_list_lock);
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
+ return c;
+
+ return NULL;
+}
+
+struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
+{
+ struct bch_fs *c;
+
+ mutex_lock(&bch_fs_list_lock);
+ c = __bch2_uuid_to_fs(uuid);
+ if (c)
+ closure_get(&c->cl);
+ mutex_unlock(&bch_fs_list_lock);
+
+ return c;
+}
+
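+/*
+ * Every journal write carries a usage entry per member device; make sure the
+ * journal entry reservation is sized for all of them:
+ */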
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, nr = 0, u64s =
+ ((sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
+ sizeof(u64);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ nr++;
+ rcu_read_unlock();
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->dev_usage_journal_res, u64s * nr);
+}
+
+/* Filesystem RO/RW: */
+
+/*
+ * For startup/shutdown of RW stuff, the dependencies are:
+ *
+ * - foreground writes depend on copygc and rebalance (to free up space)
+ *
+ * - copygc and rebalance depend on mark and sweep gc (they actually probably
+ * don't because they either reserve ahead of time or don't block if
+ * allocations fail, but allocations can require mark and sweep gc to run
+ * because of generation number wraparound)
+ *
+ * - all of the above depends on the allocator threads
+ *
+ * - allocator depends on the journal (when it rewrites prios and gens)
+ */
+
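+/*
+ * Tear down in the reverse order of the dependencies above: stop rebalance
+ * and copygc, then gc, then flush and stop the journal, and only then remove
+ * devices from the allocator:
+ */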
+static void __bch2_fs_read_only(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, clean_passes = 0;
+ u64 seq = 0;
+
+ bch2_fs_ec_stop(c);
+ bch2_open_buckets_stop(c, NULL, true);
+ bch2_rebalance_stop(c);
+ bch2_copygc_stop(c);
+ bch2_gc_thread_stop(c);
+ bch2_fs_ec_flush(c);
+
+ bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
+ journal_cur_seq(&c->journal));
+
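+ /*
+ * Loop until two consecutive passes complete without flushing anything
+ * and without the journal sequence number advancing - i.e. until
+ * everything has quiesced:
+ */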
+ do {
+ clean_passes++;
+
+ if (bch2_btree_interior_updates_flush(c) ||
+ bch2_journal_flush_all_pins(&c->journal) ||
+ bch2_btree_flush_all_writes(c) ||
+ seq != atomic64_read(&c->journal.seq)) {
+ seq = atomic64_read(&c->journal.seq);
+ clean_passes = 0;
+ }
+ } while (clean_passes < 2);
+
+ bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
+ journal_cur_seq(&c->journal));
+
+ if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ bch2_fs_journal_stop(&c->journal);
+
+ /*
+ * After the journal has been stopped, remove devices from the
+ * allocator:
+ */
+ for_each_member_device(ca, c, i)
+ bch2_dev_allocator_remove(c, ca);
+}
+
+#ifndef BCH_WRITE_REF_DEBUG
+static void bch2_writes_disabled(struct percpu_ref *writes)
+{
+ struct bch_fs *c = container_of(writes, struct bch_fs, writes);
+
+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ wake_up(&bch2_read_only_wait);
+}
+#endif
+
+void bch2_fs_read_only(struct bch_fs *c)
+{
+ if (!test_bit(BCH_FS_RW, &c->flags)) {
+ bch2_journal_reclaim_stop(&c->journal);
+ return;
+ }
+
+ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+ /*
+ * Block new foreground-end write operations from starting - any new
+ * writes will return -EROFS:
+ */
+ set_bit(BCH_FS_GOING_RO, &c->flags);
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_kill(&c->writes);
+#else
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+ bch2_write_ref_put(c, i);
+#endif
+
+ /*
+ * If we're not doing an emergency shutdown, we want to wait on
+ * outstanding writes to complete so they don't see spurious errors due
+ * to shutting down the allocator:
+ *
+ * If we are doing an emergency shutdown, outstanding writes may hang
+ * until we shut down the allocator, so we don't want to wait on
+ * outstanding writes before shutting everything down - but we do need
+ * to wait on them before returning and signalling that going RO is
+ * complete:
+ */
+ wait_event(bch2_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+
+ __bch2_fs_read_only(c);
+
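+ /*
+ * Even on emergency shutdown, don't return until the write ref has
+ * actually hit zero - going RO isn't complete until all outstanding
+ * writes have finished:
+ */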
+ wait_event(bch2_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ clear_bit(BCH_FS_GOING_RO, &c->flags);
+
+ if (!bch2_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
+ test_bit(BCH_FS_STARTED, &c->flags) &&
+ test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
+ !c->opts.norecovery) {
+ BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
+ BUG_ON(atomic_read(&c->btree_cache.dirty));
+ BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
+ BUG_ON(c->btree_write_buffer.state.nr);
+
+ bch_verbose(c, "marking filesystem clean");
+ bch2_fs_mark_clean(c);
+ }
+
+ clear_bit(BCH_FS_RW, &c->flags);
+}
+
+static void bch2_fs_read_only_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, read_only_work);
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
+}
+
+static void bch2_fs_read_only_async(struct bch_fs *c)
+{
+ queue_work(system_long_wq, &c->read_only_work);
+}
+
+bool bch2_fs_emergency_read_only(struct bch_fs *c)
+{
+ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
+
+ bch2_journal_halt(&c->journal);
+ bch2_fs_read_only_async(c);
+
+ wake_up(&bch2_read_only_wait);
+ return ret;
+}
+
+static int bch2_fs_read_write_late(struct bch_fs *c)
+{
+ int ret;
+
+ /*
+ * Data move operations can't run until after check_snapshots has
+ * completed, and bch2_snapshot_is_ancestor() is available.
+ *
+ * Ideally we'd start copygc/rebalance earlier instead of waiting for
+ * all of recovery/fsck to complete:
+ */
+ ret = bch2_copygc_start(c);
+ if (ret) {
+ bch_err(c, "error starting copygc thread");
+ return ret;
+ }
+
+ ret = bch2_rebalance_start(c);
+ if (ret) {
+ bch_err(c, "error starting rebalance thread");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __bch2_fs_read_write(struct bch_fs *c, bool early)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret;
+
+ if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
+ bch_err(c, "cannot go rw, unfixed btree errors");
+ return -BCH_ERR_erofs_unfixed_errors;
+ }
+
+ if (test_bit(BCH_FS_RW, &c->flags))
+ return 0;
+
+ if (c->opts.norecovery)
+ return -BCH_ERR_erofs_norecovery;
+
+ /*
+ * nochanges is used for fsck -n mode - we have to allow going rw
+ * during recovery for that to work:
+ */
+ if (c->opts.nochanges && (!early || c->opts.read_only))
+ return -BCH_ERR_erofs_nochanges;
+
+ bch_info(c, "going read-write");
+
+ ret = bch2_sb_members_v2_init(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_fs_mark_dirty(c);
+ if (ret)
+ goto err;
+
+ clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+
+ /*
+ * First journal write must be a flush write: after a clean shutdown we
+ * don't read the journal, so the first journal write may end up
+ * overwriting whatever was there previously, and there must always be
+ * at least one non-flush write in the journal or recovery will fail:
+ */
+ set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
+
+ for_each_rw_member(ca, c, i)
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+
+ set_bit(BCH_FS_RW, &c->flags);
+ set_bit(BCH_FS_WAS_RW, &c->flags);
+
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_reinit(&c->writes);
+#else
+ for (i = 0; i < BCH_WRITE_REF_NR; i++) {
+ BUG_ON(atomic_long_read(&c->writes[i]));
+ atomic_long_inc(&c->writes[i]);
+ }
+#endif
+
+ ret = bch2_gc_thread_start(c);
+ if (ret) {
+ bch_err(c, "error starting gc thread");
+ return ret;
+ }
+
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret)
+ goto err;
+
+ if (!early) {
+ ret = bch2_fs_read_write_late(c);
+ if (ret)
+ goto err;
+ }
+
+ bch2_do_discards(c);
+ bch2_do_invalidates(c);
+ bch2_do_stripe_deletes(c);
+ bch2_do_pending_node_rewrites(c);
+ return 0;
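+ /*
+ * Error path: if we got far enough to set BCH_FS_RW, go through the
+ * full read-only path; otherwise just tear down what was started:
+ */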
+err:
+ if (test_bit(BCH_FS_RW, &c->flags))
+ bch2_fs_read_only(c);
+ else
+ __bch2_fs_read_only(c);
+ return ret;
+}
+
+int bch2_fs_read_write(struct bch_fs *c)
+{
+ return __bch2_fs_read_write(c, false);
+}
+
+int bch2_fs_read_write_early(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ return __bch2_fs_read_write(c, true);
+}
+
+/* Filesystem startup/shutdown: */
+
+static void __bch2_fs_free(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ bch2_time_stats_exit(&c->times[i]);
+
+ bch2_free_pending_node_rewrites(c);
+ bch2_fs_sb_errors_exit(c);
+ bch2_fs_counters_exit(c);
+ bch2_fs_snapshots_exit(c);
+ bch2_fs_quota_exit(c);
+ bch2_fs_fs_io_direct_exit(c);
+ bch2_fs_fs_io_buffered_exit(c);
+ bch2_fs_fsio_exit(c);
+ bch2_fs_ec_exit(c);
+ bch2_fs_encryption_exit(c);
+ bch2_fs_nocow_locking_exit(c);
+ bch2_fs_io_write_exit(c);
+ bch2_fs_io_read_exit(c);
+ bch2_fs_buckets_waiting_for_journal_exit(c);
+ bch2_fs_btree_interior_update_exit(c);
+ bch2_fs_btree_iter_exit(c);
+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
+ bch2_fs_btree_cache_exit(c);
+ bch2_fs_replicas_exit(c);
+ bch2_fs_journal_exit(&c->journal);
+ bch2_io_clock_exit(&c->io_clock[WRITE]);
+ bch2_io_clock_exit(&c->io_clock[READ]);
+ bch2_fs_compress_exit(c);
+ bch2_journal_keys_put_initial(c);
+ BUG_ON(atomic_read(&c->journal_keys.ref));
+ bch2_fs_btree_write_buffer_exit(c);
+ percpu_free_rwsem(&c->mark_lock);
+ free_percpu(c->online_reserved);
+
+ darray_exit(&c->btree_roots_extra);
+ free_percpu(c->pcpu);
+ mempool_exit(&c->large_bkey_pool);
+ mempool_exit(&c->btree_bounce_pool);
+ bioset_exit(&c->btree_bio);
+ mempool_exit(&c->fill_iter);
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_exit(&c->writes);
+#endif
+ kfree(rcu_dereference_protected(c->disk_groups, 1));
+ kfree(c->journal_seq_blacklist_table);
+ kfree(c->unused_inode_hints);
+
+ if (c->write_ref_wq)
+ destroy_workqueue(c->write_ref_wq);
+ if (c->io_complete_wq)
+ destroy_workqueue(c->io_complete_wq);
+ if (c->copygc_wq)
+ destroy_workqueue(c->copygc_wq);
+ if (c->btree_io_complete_wq)
+ destroy_workqueue(c->btree_io_complete_wq);
+ if (c->btree_update_wq)
+ destroy_workqueue(c->btree_update_wq);
+
+ bch2_free_super(&c->disk_sb);
+ kvpfree(c, sizeof(*c));
+ module_put(THIS_MODULE);
+}
+
+static void bch2_fs_release(struct kobject *kobj)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+ __bch2_fs_free(c);
+}
+
+void __bch2_fs_stop(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ bch_verbose(c, "shutting down");
+
+ set_bit(BCH_FS_STOPPING, &c->flags);
+
+ cancel_work_sync(&c->journal_seq_blacklist_gc_work);
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
+
+ for_each_member_device(ca, c, i)
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev)
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+
+ if (c->kobj.state_in_sysfs)
+ kobject_del(&c->kobj);
+
+ bch2_fs_debug_exit(c);
+ bch2_fs_chardev_exit(c);
+
+ kobject_put(&c->counters_kobj);
+ kobject_put(&c->time_stats);
+ kobject_put(&c->opts_dir);
+ kobject_put(&c->internal);
+
+ /* btree prefetch might have kicked off reads in the background: */
+ bch2_btree_flush_all_reads(c);
+
+ for_each_member_device(ca, c, i)
+ cancel_work_sync(&ca->io_error_work);
+
+ cancel_work_sync(&c->read_only_work);
+}
+
+void bch2_fs_free(struct bch_fs *c)
+{
+ unsigned i;
+
+ mutex_lock(&bch_fs_list_lock);
+ list_del(&c->list);
+ mutex_unlock(&bch_fs_list_lock);
+
+ closure_sync(&c->cl);
+ closure_debug_destroy(&c->cl);
+
+ for (i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
+
+ if (ca) {
+ bch2_free_super(&ca->disk_sb);
+ bch2_dev_free(ca);
+ }
+ }
+
+ bch_verbose(c, "shutdown complete");
+
+ kobject_put(&c->kobj);
+}
+
+void bch2_fs_stop(struct bch_fs *c)
+{
+ __bch2_fs_stop(c);
+ bch2_fs_free(c);
+}
+
+static int bch2_fs_online(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+
+ lockdep_assert_held(&bch_fs_list_lock);
+
+ if (__bch2_uuid_to_fs(c->sb.uuid)) {
+ bch_err(c, "filesystem UUID already open");
+ return -EINVAL;
+ }
+
+ ret = bch2_fs_chardev_init(c);
+ if (ret) {
+ bch_err(c, "error creating character device");
+ return ret;
+ }
+
+ bch2_fs_debug_init(c);
+
+ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
+ kobject_add(&c->internal, &c->kobj, "internal") ?:
+ kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
+ bch2_opts_create_sysfs_files(&c->opts_dir);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
+ return ret;
+ }
+
+ down_write(&c->state_lock);
+
+ for_each_member_device(ca, c, i) {
+ ret = bch2_dev_sysfs_online(c, ca);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+ }
+
+ BUG_ON(!list_empty(&c->list));
+ list_add(&c->list, &bch_fs_list);
+err:
+ up_write(&c->state_lock);
+ return ret;
+}
+
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
+{
+ struct bch_fs *c;
+ struct printbuf name = PRINTBUF;
+ unsigned i, iter_size;
+ int ret = 0;
+
+ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+ if (!c) {
+ c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
+ goto out;
+ }
+
+ __module_get(THIS_MODULE);
+
+ closure_init(&c->cl, NULL);
+
+ c->kobj.kset = bcachefs_kset;
+ kobject_init(&c->kobj, &bch2_fs_ktype);
+ kobject_init(&c->internal, &bch2_fs_internal_ktype);
+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
+
+ c->minor = -1;
+ c->disk_sb.fs_sb = true;
+
+ init_rwsem(&c->state_lock);
+ mutex_init(&c->sb_lock);
+ mutex_init(&c->replicas_gc_lock);
+ mutex_init(&c->btree_root_lock);
+ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+
+ init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
+ atomic_set(&c->journal_keys.ref, 1);
+ c->journal_keys.initial_ref_held = true;
+
+ for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ bch2_time_stats_init(&c->times[i]);
+
+ bch2_fs_copygc_init(c);
+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
+ bch2_fs_btree_iter_init_early(c);
+ bch2_fs_btree_interior_update_init_early(c);
+ bch2_fs_allocator_background_init(c);
+ bch2_fs_allocator_foreground_init(c);
+ bch2_fs_rebalance_init(c);
+ bch2_fs_quota_init(c);
+ bch2_fs_ec_init_early(c);
+ bch2_fs_move_init(c);
+ bch2_fs_sb_errors_init_early(c);
+
+ INIT_LIST_HEAD(&c->list);
+
+ mutex_init(&c->usage_scratch_lock);
+
+ mutex_init(&c->bio_bounce_pages_lock);
+ mutex_init(&c->snapshot_table_lock);
+ init_rwsem(&c->snapshot_create_lock);
+
+ spin_lock_init(&c->btree_write_error_lock);
+
+ INIT_WORK(&c->journal_seq_blacklist_gc_work,
+ bch2_blacklist_entries_gc);
+
+ INIT_LIST_HEAD(&c->journal_iters);
+
+ INIT_LIST_HEAD(&c->fsck_error_msgs);
+ mutex_init(&c->fsck_error_msgs_lock);
+
+ seqcount_init(&c->gc_pos_lock);
+
+ seqcount_init(&c->usage_lock);
+
+ sema_init(&c->io_in_flight, 128);
+
+ INIT_LIST_HEAD(&c->vfs_inodes_list);
+ mutex_init(&c->vfs_inodes_lock);
+
+ c->copy_gc_enabled = 1;
+ c->rebalance.enabled = 1;
+ c->promote_whole_extents = true;
+
+ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
+ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
+
+ bch2_fs_btree_cache_init_early(&c->btree_cache);
+
+ mutex_init(&c->sectors_available_lock);
+
+ ret = percpu_init_rwsem(&c->mark_lock);
+ if (ret)
+ goto err;
+
+ mutex_lock(&c->sb_lock);
+ ret = bch2_sb_to_fs(c, sb);
+ mutex_unlock(&c->sb_lock);
+
+ if (ret)
+ goto err;
+
+ pr_uuid(&name, c->sb.user_uuid.b);
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
+ ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
+ if (ret)
+ goto err;
+
+ /* Compat: */
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+ !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
+
+ c->opts = bch2_opts_default;
+ ret = bch2_opts_from_sb(&c->opts, sb);
+ if (ret)
+ goto err;
+
+ bch2_opts_apply(&c->opts, opts);
+
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+ if (c->opts.inodes_use_key_cache)
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
+
+ c->block_bits = ilog2(block_sectors(c));
+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
+
+ if (bch2_fs_init_fault("fs_alloc")) {
+ bch_err(c, "fs_alloc fault injected");
+ ret = -EFAULT;
+ goto err;
+ }
+
+ iter_size = sizeof(struct sort_iter) +
+ (btree_blocks(c) + 1) * 2 *
+ sizeof(struct sort_iter_set);
+
+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
+ WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+ !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
+ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
+ WQ_FREEZABLE, 0)) ||
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_init(&c->writes, bch2_writes_disabled,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+#endif
+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+ bioset_init(&c->btree_bio, 1,
+ max(offsetof(struct btree_read_bio, bio),
+ offsetof(struct btree_write_bio, wbio.bio)),
+ BIOSET_NEED_BVECS) ||
+ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+ !(c->online_reserved = alloc_percpu(u64)) ||
+ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
+ btree_bytes(c)) ||
+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+ sizeof(u64), GFP_KERNEL))) {
+ ret = -BCH_ERR_ENOMEM_fs_other_alloc;
+ goto err;
+ }
+
+ ret = bch2_fs_counters_init(c) ?:
+ bch2_fs_sb_errors_init(c) ?:
+ bch2_io_clock_init(&c->io_clock[READ]) ?:
+ bch2_io_clock_init(&c->io_clock[WRITE]) ?:
+ bch2_fs_journal_init(&c->journal) ?:
+ bch2_fs_replicas_init(c) ?:
+ bch2_fs_btree_cache_init(c) ?:
+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
+ bch2_fs_btree_iter_init(c) ?:
+ bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_buckets_waiting_for_journal_init(c) ?:
+ bch2_fs_btree_write_buffer_init(c) ?:
+ bch2_fs_subvolumes_init(c) ?:
+ bch2_fs_io_read_init(c) ?:
+ bch2_fs_io_write_init(c) ?:
+ bch2_fs_nocow_locking_init(c) ?:
+ bch2_fs_encryption_init(c) ?:
+ bch2_fs_compress_init(c) ?:
+ bch2_fs_ec_init(c) ?:
+ bch2_fs_fsio_init(c) ?:
+ bch2_fs_fs_io_buffered_init(c) ?:
+ bch2_fs_fs_io_direct_init(c);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < c->sb.nr_devices; i++)
+ if (bch2_dev_exists(c->disk_sb.sb, i) &&
+ bch2_dev_alloc(c, i)) {
+ ret = -EEXIST;
+ goto err;
+ }
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->btree_root_journal_res,
+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
+ bch2_dev_usage_journal_reserve(c);
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->clock_journal_res,
+ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
+
+ mutex_lock(&bch_fs_list_lock);
+ ret = bch2_fs_online(c);
+ mutex_unlock(&bch_fs_list_lock);
+
+ if (ret)
+ goto err;
+out:
+ return c;
+err:
+ bch2_fs_free(c);
+ c = ERR_PTR(ret);
+ goto out;
+}
+
+noinline_for_stack
+static void print_mount_opts(struct bch_fs *c)
+{
+ enum bch_opt_id i;
+ struct printbuf p = PRINTBUF;
+ bool first = true;
+
+ prt_str(&p, "mounting version ");
+ bch2_version_to_text(&p, c->sb.version);
+
+ if (c->opts.read_only) {
+ prt_str(&p, " opts=");
+ first = false;
+ prt_printf(&p, "ro");
+ }
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+ u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+ if (!(opt->flags & OPT_MOUNT))
+ continue;
+
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ prt_str(&p, first ? " opts=" : ",");
+ first = false;
+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
+ }
+
+ bch_info(c, "%s", p.buf);
+ printbuf_exit(&p);
+}
+
+int bch2_fs_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ time64_t now = ktime_get_real_seconds();
+ unsigned i;
+ int ret;
+
+ print_mount_opts(c);
+
+ down_write(&c->state_lock);
+
+ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
+
+ mutex_lock(&c->sb_lock);
+
+ ret = bch2_sb_members_v2_init(c);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ goto err;
+ }
+
+ for_each_online_member(ca, c, i)
+ bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
+
+ mutex_unlock(&c->sb_lock);
+
+ for_each_rw_member(ca, c, i)
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+
+ ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
+ ? bch2_fs_recovery(c)
+ : bch2_fs_initialize(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_opts_check_may_set(c);
+ if (ret)
+ goto err;
+
+ if (bch2_fs_init_fault("fs_start")) {
+ bch_err(c, "fs_start fault injected");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ set_bit(BCH_FS_STARTED, &c->flags);
+
+ if (c->opts.read_only || c->opts.nochanges) {
+ bch2_fs_read_only(c);
+ } else {
+ ret = !test_bit(BCH_FS_RW, &c->flags)
+ ? bch2_fs_read_write(c)
+ : bch2_fs_read_write_late(c);
+ if (ret)
+ goto err;
+ }
+
+ ret = 0;
+out:
+ up_write(&c->state_lock);
+ return ret;
+err:
+ bch_err_msg(c, ret, "starting filesystem");
+ goto out;
+}
+
+static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
+{
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+
+ if (le16_to_cpu(sb->block_size) != block_sectors(c))
+ return -BCH_ERR_mismatched_block_size;
+
+ if (le16_to_cpu(m.bucket_size) <
+ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
+ return -BCH_ERR_bucket_size_too_small;
+
+ return 0;
+}
+
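+/*
+ * Check that a device belongs to the filesystem we're assembling: the member
+ * superblock with the higher sequence number is authoritative for whether
+ * the device is still a member:
+ */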
+static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+{
+ struct bch_sb *newest =
+ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+
+ if (!uuid_equal(&fs->uuid, &sb->uuid))
+ return -BCH_ERR_device_not_a_member_of_filesystem;
+
+ if (!bch2_dev_exists(newest, sb->dev_idx))
+ return -BCH_ERR_device_has_been_removed;
+
+ if (fs->block_size != sb->block_size)
+ return -BCH_ERR_mismatched_block_size;
+
+ return 0;
+}
+
+/* Device startup/shutdown: */
+
+static void bch2_dev_release(struct kobject *kobj)
+{
+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+
+ kfree(ca);
+}
+
+static void bch2_dev_free(struct bch_dev *ca)
+{
+ cancel_work_sync(&ca->io_error_work);
+
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev)
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+
+ if (ca->kobj.state_in_sysfs)
+ kobject_del(&ca->kobj);
+
+ bch2_free_super(&ca->disk_sb);
+ bch2_dev_journal_exit(ca);
+
+ free_percpu(ca->io_done);
+ bioset_exit(&ca->replica_set);
+ bch2_dev_buckets_free(ca);
+ free_page((unsigned long) ca->sb_read_scratch);
+
+ bch2_time_stats_exit(&ca->io_latency[WRITE]);
+ bch2_time_stats_exit(&ca->io_latency[READ]);
+
+ percpu_ref_exit(&ca->io_ref);
+ percpu_ref_exit(&ca->ref);
+ kobject_put(&ca->kobj);
+}
+
+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ if (percpu_ref_is_zero(&ca->io_ref))
+ return;
+
+ __bch2_dev_read_only(c, ca);
+
+ reinit_completion(&ca->io_ref_completion);
+ percpu_ref_kill(&ca->io_ref);
+ wait_for_completion(&ca->io_ref_completion);
+
+ if (ca->kobj.state_in_sysfs) {
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+ sysfs_remove_link(&ca->kobj, "block");
+ }
+
+ bch2_free_super(&ca->disk_sb);
+ bch2_dev_journal_exit(ca);
+}
+
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
+{
+ struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
+
+ complete(&ca->ref_completion);
+}
+
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
+{
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
+
+ complete(&ca->io_ref_completion);
+}
+
+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
+{
+ int ret;
+
+ if (!c->kobj.state_in_sysfs)
+ return 0;
+
+ if (!ca->kobj.state_in_sysfs) {
+ ret = kobject_add(&ca->kobj, &c->kobj,
+ "dev-%u", ca->dev_idx);
+ if (ret)
+ return ret;
+ }
+
+ if (ca->disk_sb.bdev) {
+ struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
+
+ ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_link(&ca->kobj, block, "block");
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
+ struct bch_member *member)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ return NULL;
+
+ kobject_init(&ca->kobj, &bch2_dev_ktype);
+ init_completion(&ca->ref_completion);
+ init_completion(&ca->io_ref_completion);
+
+ init_rwsem(&ca->bucket_lock);
+
+ INIT_WORK(&ca->io_error_work, bch2_io_error_work);
+
+ bch2_time_stats_init(&ca->io_latency[READ]);
+ bch2_time_stats_init(&ca->io_latency[WRITE]);
+
+ ca->mi = bch2_mi_to_cpu(member);
+
+ for (i = 0; i < ARRAY_SIZE(member->errors); i++)
+ atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
+
+ ca->uuid = member->uuid;
+
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
+
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
+ 0, GFP_KERNEL) ||
+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
+ bch2_dev_buckets_alloc(c, ca) ||
+ bioset_init(&ca->replica_set, 4,
+ offsetof(struct bch_write_bio, bio), 0) ||
+ !(ca->io_done = alloc_percpu(*ca->io_done)))
+ goto err;
+
+ return ca;
+err:
+ bch2_dev_free(ca);
+ return NULL;
+}
+
+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
+ unsigned dev_idx)
+{
+ ca->dev_idx = dev_idx;
+ __set_bit(ca->dev_idx, ca->self.d);
+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
+ ca->fs = c;
+ rcu_assign_pointer(c->devs[ca->dev_idx], ca);
+
+ if (bch2_dev_sysfs_online(c, ca))
+ pr_warn("error creating sysfs objects");
+}
+
+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+{
+ struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ struct bch_dev *ca = NULL;
+ int ret = 0;
+
+ if (bch2_fs_init_fault("dev_alloc"))
+ goto err;
+
+ ca = __bch2_dev_alloc(c, &member);
+ if (!ca)
+ goto err;
+
+ ca->fs = c;
+
+ bch2_dev_attach(c, ca, dev_idx);
+ return ret;
+err:
+ if (ca)
+ bch2_dev_free(ca);
+ return -BCH_ERR_ENOMEM_dev_alloc;
+}
+
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
+{
+ unsigned ret;
+
+ if (bch2_dev_is_online(ca)) {
+ bch_err(ca, "already have device online in slot %u",
+ sb->sb->dev_idx);
+ return -BCH_ERR_device_already_online;
+ }
+
+ if (get_capacity(sb->bdev->bd_disk) <
+ ca->mi.bucket_size * ca->mi.nbuckets) {
+ bch_err(ca, "cannot online: device too small");
+ return -BCH_ERR_device_size_too_small;
+ }
+
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
+ ret = bch2_dev_journal_init(ca, sb->sb);
+ if (ret)
+ return ret;
+
+ /* Commit: */
+ ca->disk_sb = *sb;
+ memset(sb, 0, sizeof(*sb));
+
+ ca->dev = ca->disk_sb.bdev->bd_dev;
+
+ percpu_ref_reinit(&ca->io_ref);
+
+ return 0;
+}
+
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ lockdep_assert_held(&c->state_lock);
+
+ if (le64_to_cpu(sb->sb->seq) >
+ le64_to_cpu(c->disk_sb.sb->seq))
+ bch2_sb_to_fs(c, sb->sb);
+
+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
+ !c->devs[sb->sb->dev_idx]);
+
+ ca = bch_dev_locked(c, sb->sb->dev_idx);
+
+ ret = __bch2_dev_attach_bdev(ca, sb);
+ if (ret)
+ return ret;
+
+ bch2_dev_sysfs_online(c, ca);
+
+ if (c->sb.nr_devices == 1)
+ snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
+ snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
+
+ rebalance_wakeup(c);
+ return 0;
+}
+
+/* Device management: */
+
+/*
+ * Note: this function is also used by the error paths - when a particular
+ * device sees an error, we call it to determine whether we can just set the
+ * device RO, or - if this function returns false - we'll set the whole
+ * filesystem RO:
+ *
+ * XXX: maybe we should be more explicit about whether we're changing state
+ * because we got an error or what have you?
+ */
+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_member_state new_state, int flags)
+{
+ struct bch_devs_mask new_online_devs;
+ struct bch_dev *ca2;
+ int i, nr_rw = 0, required;
+
+ lockdep_assert_held(&c->state_lock);
+
+ switch (new_state) {
+ case BCH_MEMBER_STATE_rw:
+ return true;
+ case BCH_MEMBER_STATE_ro:
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
+ return true;
+
+ /* do we have enough devices to write to? */
+ for_each_member_device(ca2, c, i)
+ if (ca2 != ca)
+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
+
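+ /*
+ * If the caller allows degraded metadata/data we only need the
+ * configured minimum number of replicas; otherwise we need the full
+ * replication level:
+ */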
+ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+ ? c->opts.metadata_replicas
+ : c->opts.metadata_replicas_required,
+ !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+ ? c->opts.data_replicas
+ : c->opts.data_replicas_required);
+
+ return nr_rw >= required;
+ case BCH_MEMBER_STATE_failed:
+ case BCH_MEMBER_STATE_spare:
+ if (ca->mi.state != BCH_MEMBER_STATE_rw &&
+ ca->mi.state != BCH_MEMBER_STATE_ro)
+ return true;
+
+ /* do we have enough devices to read from? */
+ new_online_devs = bch2_online_devs(c);
+ __clear_bit(ca->dev_idx, new_online_devs.d);
+
+ return bch2_have_enough_devs(c, new_online_devs, flags, false);
+ default:
+ BUG();
+ }
+}
+
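+/*
+ * Can we start with the devices currently available? Unless a degraded mount
+ * was requested, every member marked rw or ro must be online:
+ */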
+static bool bch2_fs_may_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, flags = 0;
+
+ if (c->opts.very_degraded)
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
+
+ if (c->opts.degraded)
+ flags |= BCH_FORCE_IF_DEGRADED;
+
+ if (!c->opts.degraded &&
+ !c->opts.very_degraded) {
+ mutex_lock(&c->sb_lock);
+
+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ if (!bch2_dev_exists(c->disk_sb.sb, i))
+ continue;
+
+ ca = bch_dev_locked(c, i);
+
+ if (!bch2_dev_is_online(ca) &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro)) {
+ mutex_unlock(&c->sb_lock);
+ return false;
+ }
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
+}
+
+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
+{
+ /*
+ * The allocator thread itself allocates btree nodes, so stop it first:
+ */
+ bch2_dev_allocator_remove(c, ca);
+ bch2_dev_journal_stop(&c->journal, ca);
+}
+
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
+
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+}
+
+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_member_state new_state, int flags)
+{
+ struct bch_member *m;
+ int ret = 0;
+
+ if (ca->mi.state == new_state)
+ return 0;
+
+ if (!bch2_dev_state_allowed(c, ca, new_state, flags))
+ return -BCH_ERR_device_state_not_allowed;
+
+ if (new_state != BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_only(c, ca);
+
+ bch_notice(ca, "%s", bch2_member_states[new_state]);
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_STATE(m, new_state);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (new_state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
+
+ rebalance_wakeup(c);
+
+ return ret;
+}
+
+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_member_state new_state, int flags)
+{
+ int ret;
+
+ down_write(&c->state_lock);
+ ret = __bch2_dev_set_state(c, ca, new_state, flags);
+ up_write(&c->state_lock);
+
+ return ret;
+}
+
+/* Device add/removal: */
+
+static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
+ int ret;
+
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
+ BTREE_TRIGGER_NORUN, NULL);
+ if (ret)
+ bch_err_msg(c, ret, "removing dev alloc info");
+
+ return ret;
+}
+
+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+ struct bch_member *m;
+ unsigned dev_idx = ca->dev_idx, data;
+ int ret;
+
+ down_write(&c->state_lock);
+
+ /*
+ * We consume a reference to ca->ref, regardless of whether we succeed
+ * or fail:
+ */
+ percpu_ref_put(&ca->ref);
+
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
+ bch_err(ca, "Cannot remove without losing data");
+ ret = -BCH_ERR_device_state_not_allowed;
+ goto err;
+ }
+
+ __bch2_dev_read_only(c, ca);
+
+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
+ if (ret) {
+ bch_err_msg(ca, ret, "dropping data");
+ goto err;
+ }
+
+ ret = bch2_dev_remove_alloc(c, ca);
+ if (ret) {
+ bch_err_msg(ca, ret, "deleting alloc info");
+ goto err;
+ }
+
+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+ if (ret) {
+ bch_err_msg(ca, ret, "flushing journal");
+ goto err;
+ }
+
+ ret = bch2_journal_flush(&c->journal);
+ if (ret) {
+ bch_err(ca, "journal error");
+ goto err;
+ }
+
+ ret = bch2_replicas_gc2(c);
+ if (ret) {
+ bch_err_msg(ca, ret, "in replicas_gc2()");
+ goto err;
+ }
+
+ data = bch2_dev_has_data(c, ca);
+ if (data) {
+ struct printbuf data_has = PRINTBUF;
+
+ prt_bitflags(&data_has, bch2_data_types, data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ printbuf_exit(&data_has);
+ ret = -EBUSY;
+ goto err;
+ }
+
+ __bch2_dev_offline(c, ca);
+
+ mutex_lock(&c->sb_lock);
+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+ mutex_unlock(&c->sb_lock);
+
+ percpu_ref_kill(&ca->ref);
+ wait_for_completion(&ca->ref_completion);
+
+ bch2_dev_free(ca);
+
+ /*
+ * At this point the device object has been removed in-core, but the
+ * on-disk journal might still refer to the device index via sb device
+ * usage entries. Recovery fails if it sees usage information for an
+ * invalid device. Flush journal pins to push the back of the journal
+ * past now invalid device index references before we update the
+ * superblock, but after the device object has been removed so any
+ * further journal writes elide usage info for the device.
+ */
+ bch2_journal_flush_all_pins(&c->journal);
+
+ /*
+ * Free this device's slot in the bch_member array - all pointers to
+ * this device must be gone:
+ */
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+ memset(&m->uuid, 0, sizeof(m->uuid));
+
+ bch2_write_super(c);
+
+ mutex_unlock(&c->sb_lock);
+ up_write(&c->state_lock);
+
+ bch2_dev_usage_journal_reserve(c);
+ return 0;
+err:
+ if (ca->mi.state == BCH_MEMBER_STATE_rw &&
+ !percpu_ref_is_zero(&ca->io_ref))
+ __bch2_dev_read_write(c, ca);
+ up_write(&c->state_lock);
+ return ret;
+}
+
+/* Add new device to running filesystem: */
+int bch2_dev_add(struct bch_fs *c, const char *path)
+{
+ struct bch_opts opts = bch2_opts_empty();
+ struct bch_sb_handle sb;
+ struct bch_dev *ca = NULL;
+ struct bch_sb_field_members_v2 *mi;
+ struct bch_member dev_mi;
+ unsigned dev_idx, nr_devices, u64s;
+ struct printbuf errbuf = PRINTBUF;
+ struct printbuf label = PRINTBUF;
+ int ret;
+
+ ret = bch2_read_super(path, &opts, &sb);
+ if (ret) {
+ bch_err_msg(c, ret, "reading super");
+ goto err;
+ }
+
+ dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
+
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ if (label.allocation_failure) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+
+ ret = bch2_dev_may_add(sb.sb, c);
+ if (ret) {
+ bch_err_fn(c, ret);
+ goto err;
+ }
+
+ ca = __bch2_dev_alloc(c, &dev_mi);
+ if (!ca) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bch2_dev_usage_init(ca);
+
+ ret = __bch2_dev_attach_bdev(ca, &sb);
+ if (ret)
+ goto err;
+
+ ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
+ bch_err_msg(c, ret, "allocating journal");
+ goto err;
+ }
+
+ down_write(&c->state_lock);
+ mutex_lock(&c->sb_lock);
+
+ ret = bch2_sb_from_fs(c, ca);
+ if (ret) {
+ bch_err_msg(c, ret, "setting up new superblock");
+ goto err_unlock;
+ }
+
+ if (dynamic_fault("bcachefs:add:no_slot"))
+ goto no_slot;
+
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
+ if (!bch2_dev_exists(c->disk_sb.sb, dev_idx))
+ goto have_slot;
+no_slot:
+ ret = -BCH_ERR_ENOSPC_sb_members;
+ bch_err_msg(c, ret, "setting up new superblock");
+ goto err_unlock;
+
+have_slot:
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
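+ /*
+ * Resize the members_v2 section to hold nr_devices entries; superblock
+ * field sizes are in units of u64s, hence the round up:
+ */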
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi) {
+ ret = -BCH_ERR_ENOSPC_sb_members;
+ bch_err_msg(c, ret, "setting up new superblock");
+ goto err_unlock;
+ }
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+
+ /* success: */
+
+ *m = dev_mi;
+ m->last_mount = cpu_to_le64(ktime_get_real_seconds());
+ c->disk_sb.sb->nr_devices = nr_devices;
+
+ ca->disk_sb.sb->dev_idx = dev_idx;
+ bch2_dev_attach(c, ca, dev_idx);
+
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ ret = __bch2_dev_group_set(c, ca, label.buf);
+ if (ret) {
+ bch_err_msg(c, ret, "creating new label");
+ goto err_unlock;
+ }
+ }
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ bch2_dev_usage_journal_reserve(c);
+
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ bch_err_msg(ca, ret, "marking new superblock");
+ goto err_late;
+ }
+
+ ret = bch2_fs_freespace_init(c);
+ if (ret) {
+ bch_err_msg(ca, ret, "initializing free space");
+ goto err_late;
+ }
+
+ ca->new_fs_bucket_idx = 0;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
+
+ up_write(&c->state_lock);
+ return 0;
+
+err_unlock:
+ mutex_unlock(&c->sb_lock);
+ up_write(&c->state_lock);
+err:
+ if (ca)
+ bch2_dev_free(ca);
+ bch2_free_super(&sb);
+ printbuf_exit(&label);
+ printbuf_exit(&errbuf);
+ return ret;
+err_late:
+ up_write(&c->state_lock);
+ ca = NULL;
+ goto err;
+}
+
+/* Hot add existing device to running filesystem: */
+int bch2_dev_online(struct bch_fs *c, const char *path)
+{
+ struct bch_opts opts = bch2_opts_empty();
+ struct bch_sb_handle sb = { NULL };
+ struct bch_dev *ca;
+ unsigned dev_idx;
+ int ret;
+
+ down_write(&c->state_lock);
+
+ ret = bch2_read_super(path, &opts, &sb);
+ if (ret) {
+ up_write(&c->state_lock);
+ return ret;
+ }
+
+ dev_idx = sb.sb->dev_idx;
+
+ ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+ if (ret) {
+ bch_err_msg(c, ret, "bringing %s online", path);
+ goto err;
+ }
+
+ ret = bch2_dev_attach_bdev(c, &sb);
+ if (ret)
+ goto err;
+
+ ca = bch_dev_locked(c, dev_idx);
+
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ goto err;
+ }
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
+
+ if (!ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err;
+ }
+
+ if (!ca->journal.nr) {
+ ret = bch2_dev_journal_alloc(ca);
+ bch_err_msg(ca, ret, "allocating journal");
+ if (ret)
+ goto err;
+ }
+
+ mutex_lock(&c->sb_lock);
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(ktime_get_real_seconds());
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ up_write(&c->state_lock);
+ return 0;
+err:
+ up_write(&c->state_lock);
+ bch2_free_super(&sb);
+ return ret;
+}
+
+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+ down_write(&c->state_lock);
+
+ if (!bch2_dev_is_online(ca)) {
+ bch_err(ca, "Already offline");
+ up_write(&c->state_lock);
+ return 0;
+ }
+
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
+ bch_err(ca, "Cannot offline required disk");
+ up_write(&c->state_lock);
+ return -BCH_ERR_device_state_not_allowed;
+ }
+
+ __bch2_dev_offline(c, ca);
+
+ up_write(&c->state_lock);
+ return 0;
+}
+
+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+ struct bch_member *m;
+ u64 old_nbuckets;
+ int ret = 0;
+
+ down_write(&c->state_lock);
+ old_nbuckets = ca->mi.nbuckets;
+
+ if (nbuckets < ca->mi.nbuckets) {
+ bch_err(ca, "Cannot shrink yet");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (bch2_dev_is_online(ca) &&
+ get_capacity(ca->disk_sb.bdev->bd_disk) <
+ ca->mi.bucket_size * nbuckets) {
+ bch_err(ca, "New size larger than device");
+ ret = -BCH_ERR_device_size_too_small;
+ goto err;
+ }
+
+ ret = bch2_dev_buckets_resize(c, ca, nbuckets);
+ if (ret) {
+ bch_err_msg(ca, ret, "resizing buckets");
+ goto err;
+ }
+
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret)
+ goto err;
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ m->nbuckets = cpu_to_le64(nbuckets);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+ if (ret)
+ goto err;
+
+ /*
+ * XXX: this is all wrong transactionally - we'll be able to do
+ * this correctly after the disk space accounting rewrite
+ */
+ ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
+ }
+
+ bch2_recalc_capacity(c);
+err:
+ up_write(&c->state_lock);
+ return ret;
+}
+
+/* return with ref on ca->ref: */
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ if (!strcmp(name, ca->name))
+ goto found;
+ ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
+found:
+ rcu_read_unlock();
+
+ return ca;
+}
+
+/* Filesystem open: */
+
+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
+ struct bch_opts opts)
+{
+ DARRAY(struct bch_sb_handle) sbs = { 0 };
+ struct bch_fs *c = NULL;
+ struct bch_sb_handle *sb, *best = NULL;
+ struct printbuf errbuf = PRINTBUF;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-ENODEV);
+
+ if (!nr_devices) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = darray_make_room(&sbs, nr_devices);
+ if (ret)
+ goto err;
+
+ for (unsigned i = 0; i < nr_devices; i++) {
+ struct bch_sb_handle sb = { NULL };
+
+ ret = bch2_read_super(devices[i], &opts, &sb);
+ if (ret)
+ goto err;
+
+ BUG_ON(darray_push(&sbs, sb));
+ }
+
+ darray_for_each(sbs, sb)
+ if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+ best = sb;
+
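+ /*
+ * Drop any devices the newest superblock says have been removed;
+ * darray_remove_item() shifts the array down, so adjust best if it
+ * pointed past the removed entry:
+ */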
+ darray_for_each_reverse(sbs, sb) {
+ if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
+ pr_info("%pg has been removed, skipping", sb->bdev);
+ bch2_free_super(sb);
+ darray_remove_item(&sbs, sb);
+ best -= best > sb;
+ continue;
+ }
+
+ ret = bch2_dev_in_fs(best->sb, sb->sb);
+ if (ret)
+ goto err_print;
+ }
+
+ c = bch2_fs_alloc(best->sb, opts);
+ ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ goto err;
+
+ down_write(&c->state_lock);
+ darray_for_each(sbs, sb) {
+ ret = bch2_dev_attach_bdev(c, sb);
+ if (ret) {
+ up_write(&c->state_lock);
+ goto err;
+ }
+ }
+ up_write(&c->state_lock);
+
+ if (!bch2_fs_may_start(c)) {
+ ret = -BCH_ERR_insufficient_devices_to_start;
+ goto err_print;
+ }
+
+ if (!c->opts.nostart) {
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err;
+ }
+out:
+ darray_for_each(sbs, sb)
+ bch2_free_super(sb);
+ darray_exit(&sbs);
+ printbuf_exit(&errbuf);
+ module_put(THIS_MODULE);
+ return c;
+err_print:
+ pr_err("bch_fs_open err opening %s: %s",
+ devices[0], bch2_err_str(ret));
+err:
+ if (!IS_ERR_OR_NULL(c))
+ bch2_fs_stop(c);
+ c = ERR_PTR(ret);
+ goto out;
+}
+
+/* Global interfaces/init */
+
+static void bcachefs_exit(void)
+{
+ bch2_debug_exit();
+ bch2_vfs_exit();
+ bch2_chardev_exit();
+ bch2_btree_key_cache_exit();
+ if (bcachefs_kset)
+ kset_unregister(bcachefs_kset);
+}
+
+static int __init bcachefs_init(void)
+{
+ bch2_bkey_pack_test();
+
+ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+ bch2_btree_key_cache_init() ||
+ bch2_chardev_init() ||
+ bch2_vfs_init() ||
+ bch2_debug_init())
+ goto err;
+
+ return 0;
+err:
+ bcachefs_exit();
+ return -ENOMEM;
+}
+
+#define BCH_DEBUG_PARAM(name, description) \
+ bool bch2_##name; \
+ module_param_named(name, bch2_##name, bool, 0644); \
+ MODULE_PARM_DESC(name, description);
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+__maybe_unused
+static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
+module_param_named(version, bch2_metadata_version, uint, 0400);
+
+module_exit(bcachefs_exit);
+module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
new file mode 100644
index 000000000000..bf762df18012
--- /dev/null
+++ b/fs/bcachefs/super.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_H
+#define _BCACHEFS_SUPER_H
+
+#include "extents.h"
+
+#include "bcachefs_ioctl.h"
+
+#include <linux/math64.h>
+
+struct bch_fs *bch2_dev_to_fs(dev_t);
+struct bch_fs *bch2_uuid_to_fs(__uuid_t);
+
+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
+ enum bch_member_state, int);
+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+ enum bch_member_state, int);
+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+ enum bch_member_state, int);
+
+int bch2_dev_fail(struct bch_dev *, int);
+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_add(struct bch_fs *, const char *);
+int bch2_dev_online(struct bch_fs *, const char *);
+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
+
+bool bch2_fs_emergency_read_only(struct bch_fs *);
+void bch2_fs_read_only(struct bch_fs *);
+
+int bch2_fs_read_write(struct bch_fs *);
+int bch2_fs_read_write_early(struct bch_fs *);
+
+/*
+ * Only for use in the recovery/fsck path:
+ */
+static inline void bch2_fs_lazy_rw(struct bch_fs *c)
+{
+ if (!test_bit(BCH_FS_RW, &c->flags) &&
+ !test_bit(BCH_FS_WAS_RW, &c->flags))
+ bch2_fs_read_write_early(c);
+}
+
+void __bch2_fs_stop(struct bch_fs *);
+void bch2_fs_free(struct bch_fs *);
+void bch2_fs_stop(struct bch_fs *);
+
+int bch2_fs_start(struct bch_fs *);
+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+
+#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
new file mode 100644
index 000000000000..9c1fd4ca2b10
--- /dev/null
+++ b/fs/bcachefs/super_types.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_TYPES_H
+#define _BCACHEFS_SUPER_TYPES_H
+
+struct bch_sb_handle {
+ struct bch_sb *sb;
+ struct block_device *bdev;
+ char *sb_name;
+ struct bio *bio;
+ void *holder;
+ size_t buffer_size;
+ blk_mode_t mode;
+ unsigned have_layout:1;
+ unsigned have_bio:1;
+ unsigned fs_sb:1;
+ u64 seq;
+};
+
+struct bch_devs_mask {
+ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
+};
+
+struct bch_devs_list {
+ u8 nr;
+ u8 devs[BCH_BKEY_PTRS_MAX];
+};
+
+struct bch_member_cpu {
+ u64 nbuckets; /* device size */
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u16 group;
+ u8 state;
+ u8 discard;
+ u8 data_allowed;
+ u8 durability;
+ u8 freespace_initialized;
+ u8 valid;
+};
+
+#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
new file mode 100644
index 000000000000..f3cb7115b530
--- /dev/null
+++ b/fs/bcachefs/sysfs.c
@@ -0,0 +1,1034 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#ifndef NO_BCACHEFS_SYSFS
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "sysfs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "opts.h"
+#include "rebalance.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "tests.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/sched/clock.h>
+
+#include "util.h"
+
+#define SYSFS_OPS(type) \
+const struct sysfs_ops type ## _sysfs_ops = { \
+ .show = type ## _show, \
+ .store = type ## _store \
+}
+
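+/*
+ * SHOW() and STORE() generate the sysfs show/store methods: the show path
+ * renders into a printbuf and copies at most one page out, and both paths
+ * map internal error codes to standard errnos via bch2_err_class():
+ */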
+#define SHOW(fn) \
+static ssize_t fn ## _to_text(struct printbuf *, \
+ struct kobject *, struct attribute *); \
+ \
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+ char *buf) \
+{ \
+ struct printbuf out = PRINTBUF; \
+ ssize_t ret = fn ## _to_text(&out, kobj, attr); \
+ \
+ if (out.pos && out.buf[out.pos - 1] != '\n') \
+ prt_newline(&out); \
+ \
+ if (!ret && out.allocation_failure) \
+ ret = -ENOMEM; \
+ \
+ if (!ret) { \
+ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
+ memcpy(buf, out.buf, ret); \
+ } \
+ printbuf_exit(&out); \
+ return bch2_err_class(ret); \
+} \
+ \
+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
+ struct attribute *attr)
+
+#define STORE(fn) \
+static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
+ const char *, size_t); \
+ \
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size) \
+{ \
+ return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
+} \
+ \
+static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size)
+
+#define __sysfs_attribute(_name, _mode) \
+ static struct attribute sysfs_##_name = \
+ { .name = #_name, .mode = _mode }
+
+#define write_attribute(n) __sysfs_attribute(n, 0200)
+#define read_attribute(n) __sysfs_attribute(n, 0444)
+#define rw_attribute(n) __sysfs_attribute(n, 0644)
+
+#define sysfs_printf(file, fmt, ...) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ prt_printf(out, fmt "\n", __VA_ARGS__); \
+} while (0)
+
+#define sysfs_print(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ snprint(out, var); \
+} while (0)
+
+#define sysfs_hprint(file, val) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ prt_human_readable_s64(out, val); \
+} while (0)
+
+#define sysfs_strtoul(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe(buf, var) ?: (ssize_t) size; \
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe_clamp(buf, var, min, max) \
+ ?: (ssize_t) size; \
+} while (0)
+
+#define strtoul_or_return(cp) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (_r) \
+ return _r; \
+ _v; \
+})
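
Editorial note on the helpers above: SHOW()/STORE() paste the kobject type name onto a generated show/store wrapper plus the _to_text()/_store_inner() worker it forwards to, and strtoul_or_return() is a GNU statement expression that returns from the enclosing store function when parsing fails. A minimal userspace sketch of the same two tricks follows; the names are hypothetical, and only the standard C library plus GNU extensions (as used throughout the kernel) are assumed.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Generate fn_show() plus the fn_to_text() worker it forwards to: */
    #define SHOW(fn)                                                        \
    static int fn ## _to_text(char *buf, size_t len, const char *attr);    \
                                                                            \
    static int fn ## _show(const char *attr, char *buf, size_t len)        \
    {                                                                       \
        int ret = fn ## _to_text(buf, len, attr);                           \
        return ret < 0 ? ret : (int) strlen(buf);                           \
    }                                                                       \
                                                                            \
    static int fn ## _to_text(char *buf, size_t len, const char *attr)

    /* Parse a decimal value or return early from the *calling* function: */
    #define strtoul_or_return(cp)                                           \
    ({                                                                      \
        char *_end;                                                         \
        unsigned long _v = strtoul(cp, &_end, 10);                          \
        if (_end == (cp))                                                   \
            return -1;                                                      \
        _v;                                                                 \
    })

    SHOW(demo)
    {
        if (!strcmp(attr, "answer"))
            return snprintf(buf, len, "42\n");
        return -1;
    }

    static int demo_store(const char *buf)
    {
        unsigned long v = strtoul_or_return(buf);

        printf("stored %lu\n", v);
        return 0;
    }

    int main(void)
    {
        char buf[64];

        if (demo_show("answer", buf, sizeof(buf)) > 0)
            fputs(buf, stdout);             /* prints "42" */

        demo_store("123");                  /* prints "stored 123" */
        if (demo_store("junk") < 0)         /* parse failure returns early */
            fputs("parse error\n", stderr);
        return 0;
    }
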
+
+write_attribute(trigger_gc);
+write_attribute(trigger_discards);
+write_attribute(trigger_invalidates);
+write_attribute(prune_cache);
+write_attribute(btree_wakeup);
+rw_attribute(btree_gc_periodic);
+rw_attribute(gc_gens_pos);
+
+read_attribute(uuid);
+read_attribute(minor);
+read_attribute(bucket_size);
+read_attribute(first_bucket);
+read_attribute(nbuckets);
+rw_attribute(durability);
+read_attribute(io_done);
+read_attribute(io_errors);
+write_attribute(io_errors_reset);
+
+read_attribute(io_latency_read);
+read_attribute(io_latency_write);
+read_attribute(io_latency_stats_read);
+read_attribute(io_latency_stats_write);
+read_attribute(congested);
+
+read_attribute(btree_write_stats);
+
+read_attribute(btree_cache_size);
+read_attribute(compression_stats);
+read_attribute(journal_debug);
+read_attribute(btree_updates);
+read_attribute(btree_cache);
+read_attribute(btree_key_cache);
+read_attribute(stripes_heap);
+read_attribute(open_buckets);
+read_attribute(open_buckets_partial);
+read_attribute(write_points);
+read_attribute(nocow_lock_table);
+
+#ifdef BCH_WRITE_REF_DEBUG
+read_attribute(write_refs);
+
+static const char * const bch2_write_refs[] = {
+#define x(n) #n,
+ BCH_WRITE_REFS()
+#undef x
+ NULL
+};
+
+static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ bch2_printbuf_tabstop_push(out, 24);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
+ prt_str(out, bch2_write_refs[i]);
+ prt_tab(out);
+ prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
+ prt_newline(out);
+ }
+}
+#endif
+
+read_attribute(internal_uuid);
+read_attribute(disk_groups);
+
+read_attribute(has_data);
+read_attribute(alloc_debug);
+
+#define x(t, n, ...) read_attribute(t);
+BCH_PERSISTENT_COUNTERS()
+#undef x
+
+rw_attribute(discard);
+rw_attribute(label);
+
+rw_attribute(copy_gc_enabled);
+read_attribute(copy_gc_wait);
+
+rw_attribute(rebalance_enabled);
+sysfs_pd_controller_attribute(rebalance);
+read_attribute(rebalance_status);
+rw_attribute(promote_whole_extents);
+
+read_attribute(new_stripes);
+
+read_attribute(io_timers_read);
+read_attribute(io_timers_write);
+
+read_attribute(moving_ctxts);
+
+#ifdef CONFIG_BCACHEFS_TESTS
+write_attribute(perf_test);
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#define x(_name) \
+ static struct attribute sysfs_time_stat_##_name = \
+ { .name = #_name, .mode = 0444 };
+ BCH_TIME_STATS()
+#undef x
+
+static struct attribute sysfs_state_rw = {
+ .name = "state",
+ .mode = 0444,
+};
+
+static size_t bch2_btree_cache_size(struct bch_fs *c)
+{
+ size_t ret = 0;
+ struct btree *b;
+
+ mutex_lock(&c->btree_cache.lock);
+ list_for_each_entry(b, &c->btree_cache.live, list)
+ ret += btree_bytes(c);
+
+ mutex_unlock(&c->btree_cache.lock);
+ return ret;
+}
+
+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ enum btree_id id;
+ u64 nr_uncompressed_extents = 0,
+ nr_compressed_extents = 0,
+ nr_incompressible_extents = 0,
+ uncompressed_sectors = 0,
+ incompressible_sectors = 0,
+ compressed_sectors_compressed = 0,
+ compressed_sectors_uncompressed = 0;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EPERM;
+
+ trans = bch2_trans_get(c);
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_ptrs(id))
+ continue;
+
+ ret = for_each_btree_key2(trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bool compressed = false, uncompressed = false, incompressible = false;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ switch (p.crc.compression_type) {
+ case BCH_COMPRESSION_TYPE_none:
+ uncompressed = true;
+ uncompressed_sectors += k.k->size;
+ break;
+ case BCH_COMPRESSION_TYPE_incompressible:
+ incompressible = true;
+ incompressible_sectors += k.k->size;
+ break;
+ default:
+ compressed_sectors_compressed +=
+ p.crc.compressed_size;
+ compressed_sectors_uncompressed +=
+ p.crc.uncompressed_size;
+ compressed = true;
+ break;
+ }
+ }
+
+ if (incompressible)
+ nr_incompressible_extents++;
+ else if (uncompressed)
+ nr_uncompressed_extents++;
+ else if (compressed)
+ nr_compressed_extents++;
+ 0;
+ }));
+ }
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ return ret;
+
+ prt_printf(out, "uncompressed:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents);
+ prt_printf(out, " size: ");
+ prt_human_readable_u64(out, uncompressed_sectors << 9);
+ prt_printf(out, "\n");
+
+ prt_printf(out, "compressed:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_compressed_extents);
+ prt_printf(out, " compressed size: ");
+ prt_human_readable_u64(out, compressed_sectors_compressed << 9);
+ prt_printf(out, "\n");
+ prt_printf(out, " uncompressed size: ");
+ prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
+ prt_printf(out, "\n");
+
+ prt_printf(out, "incompressible:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents);
+ prt_printf(out, " size: ");
+ prt_human_readable_u64(out, incompressible_sectors << 9);
+ prt_printf(out, "\n");
+ return 0;
+}
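
A note on the accounting above: each extent is classified by the compression type of its pointers, and sector counts are kept both as stored on disk (crc.compressed_size) and as logical data (crc.uncompressed_size); the report then shifts by 9 to turn 512-byte sectors into bytes. A standalone sketch of that arithmetic, using made-up totals rather than anything read from a filesystem:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Hypothetical totals, in 512-byte sectors: */
        uint64_t compressed_sectors_compressed   = 1500;    /* on-disk size */
        uint64_t compressed_sectors_uncompressed = 4096;    /* logical size */

        printf("compressed size:   %llu bytes\n",
               (unsigned long long)(compressed_sectors_compressed << 9));
        printf("uncompressed size: %llu bytes\n",
               (unsigned long long)(compressed_sectors_uncompressed << 9));
        printf("compression ratio: %.2fx\n",
               (double)compressed_sectors_uncompressed /
               (double)compressed_sectors_compressed);
        return 0;
    }
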
+
+static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree));
+ bch2_bpos_to_text(out, c->gc_gens_pos);
+ prt_printf(out, "\n");
+}
+
+static void bch2_btree_wakeup_all(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
+
+ if (b)
+ six_lock_wakeup_all(&b->lock);
+
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+}
+
+SHOW(bch2_fs)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+ sysfs_print(minor, c->minor);
+ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
+
+ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
+
+ if (attr == &sysfs_btree_write_stats)
+ bch2_btree_write_stats_to_text(out, c);
+
+ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
+
+ if (attr == &sysfs_gc_gens_pos)
+ bch2_gc_gens_pos_to_text(out, c);
+
+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+
+ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
+ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
+
+ if (attr == &sysfs_copy_gc_wait)
+ bch2_copygc_wait_to_text(out, c);
+
+ if (attr == &sysfs_rebalance_status)
+ bch2_rebalance_status_to_text(out, c);
+
+ sysfs_print(promote_whole_extents, c->promote_whole_extents);
+
+ /* Debugging: */
+
+ if (attr == &sysfs_journal_debug)
+ bch2_journal_debug_to_text(out, &c->journal);
+
+ if (attr == &sysfs_btree_updates)
+ bch2_btree_updates_to_text(out, c);
+
+ if (attr == &sysfs_btree_cache)
+ bch2_btree_cache_to_text(out, c);
+
+ if (attr == &sysfs_btree_key_cache)
+ bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
+
+ if (attr == &sysfs_stripes_heap)
+ bch2_stripes_heap_to_text(out, c);
+
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c);
+
+ if (attr == &sysfs_open_buckets_partial)
+ bch2_open_buckets_partial_to_text(out, c);
+
+ if (attr == &sysfs_write_points)
+ bch2_write_points_to_text(out, c);
+
+ if (attr == &sysfs_compression_stats)
+ bch2_compression_stats_to_text(out, c);
+
+ if (attr == &sysfs_new_stripes)
+ bch2_new_stripes_to_text(out, c);
+
+ if (attr == &sysfs_io_timers_read)
+ bch2_io_timers_to_text(out, &c->io_clock[READ]);
+
+ if (attr == &sysfs_io_timers_write)
+ bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
+
+ if (attr == &sysfs_moving_ctxts)
+ bch2_fs_moving_ctxts_to_text(out, c);
+
+#ifdef BCH_WRITE_REF_DEBUG
+ if (attr == &sysfs_write_refs)
+ bch2_write_refs_to_text(out, c);
+#endif
+
+ if (attr == &sysfs_nocow_lock_table)
+ bch2_nocow_locks_to_text(out, &c->nocow_locks);
+
+ if (attr == &sysfs_disk_groups)
+ bch2_disk_groups_to_text(out, c);
+
+ return 0;
+}
+
+STORE(bch2_fs)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+ if (attr == &sysfs_btree_gc_periodic) {
+ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
+ ?: (ssize_t) size;
+
+ wake_up_process(c->gc_thread);
+ return ret;
+ }
+
+ if (attr == &sysfs_copy_gc_enabled) {
+ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
+ ?: (ssize_t) size;
+
+ if (c->copygc_thread)
+ wake_up_process(c->copygc_thread);
+ return ret;
+ }
+
+ if (attr == &sysfs_rebalance_enabled) {
+ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
+ ?: (ssize_t) size;
+
+ rebalance_wakeup(c);
+ return ret;
+ }
+
+ sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
+
+ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
+
+ /* Debugging: */
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EPERM;
+
+ if (!test_bit(BCH_FS_RW, &c->flags))
+ return -EROFS;
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ }
+
+ if (attr == &sysfs_btree_wakeup)
+ bch2_btree_wakeup_all(c);
+
+ if (attr == &sysfs_trigger_gc) {
+ /*
+ * Full gc is currently incompatible with btree key cache:
+ */
+#if 0
+ down_read(&c->state_lock);
+ bch2_gc(c, false, false);
+ up_read(&c->state_lock);
+#else
+ bch2_gc_gens(c);
+#endif
+ }
+
+ if (attr == &sysfs_trigger_discards)
+ bch2_do_discards(c);
+
+ if (attr == &sysfs_trigger_invalidates)
+ bch2_do_invalidates(c);
+
+#ifdef CONFIG_BCACHEFS_TESTS
+ if (attr == &sysfs_perf_test) {
+ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
+ char *test = strsep(&p, " \t\n");
+ char *nr_str = strsep(&p, " \t\n");
+ char *threads_str = strsep(&p, " \t\n");
+ unsigned threads;
+ u64 nr;
+ int ret = -EINVAL;
+
+ if (threads_str &&
+ !(ret = kstrtouint(threads_str, 10, &threads)) &&
+ !(ret = bch2_strtoull_h(nr_str, &nr)))
+ ret = bch2_btree_perf_test(c, test, nr, threads);
+ kfree(tmp);
+
+ if (ret)
+ size = ret;
+ }
+#endif
+ return size;
+}
+SYSFS_OPS(bch2_fs);
+
+struct attribute *bch2_fs_files[] = {
+ &sysfs_minor,
+ &sysfs_btree_cache_size,
+ &sysfs_btree_write_stats,
+
+ &sysfs_promote_whole_extents,
+
+ &sysfs_compression_stats,
+
+#ifdef CONFIG_BCACHEFS_TESTS
+ &sysfs_perf_test,
+#endif
+ NULL
+};
+
+/* counters dir */
+
+SHOW(bch2_fs_counters)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
+ u64 counter = 0;
+ u64 counter_since_mount = 0;
+
+ printbuf_tabstop_push(out, 32);
+
+ #define x(t, ...) \
+ if (attr == &sysfs_##t) { \
+ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
+ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+ prt_printf(out, "since mount:"); \
+ prt_tab(out); \
+ prt_human_readable_u64(out, counter_since_mount); \
+ prt_newline(out); \
+ \
+ prt_printf(out, "since filesystem creation:"); \
+ prt_tab(out); \
+ prt_human_readable_u64(out, counter); \
+ prt_newline(out); \
+ }
+ BCH_PERSISTENT_COUNTERS()
+ #undef x
+ return 0;
+}
+
+STORE(bch2_fs_counters) {
+ return 0;
+}
+
+SYSFS_OPS(bch2_fs_counters);
+
+struct attribute *bch2_fs_counters_files[] = {
+#define x(t, ...) \
+ &sysfs_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
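
Everything in this counters directory is generated from the BCH_PERSISTENT_COUNTERS() x-macro: the same list expands into the read_attribute() declarations earlier in the file, into the show dispatch above, and into this attribute array, so adding a counter is a one-line change to the list. A self-contained illustration of the x-macro technique, using a generic list rather than the real bcachefs counter names:

    #include <stdio.h>

    #define DEMO_COUNTERS()         \
        x(reads)                    \
        x(writes)                   \
        x(flushes)

    /* Expansion 1: an enum of counter indices */
    enum demo_counter {
    #define x(n)    DEMO_COUNTER_##n,
        DEMO_COUNTERS()
    #undef x
        DEMO_COUNTER_NR,
    };

    /* Expansion 2: a matching table of names */
    static const char * const demo_counter_names[] = {
    #define x(n)    #n,
        DEMO_COUNTERS()
    #undef x
    };

    int main(void)
    {
        unsigned long counters[DEMO_COUNTER_NR] = { 10, 20, 3 };

        for (int i = 0; i < DEMO_COUNTER_NR; i++)
            printf("%-10s %lu\n", demo_counter_names[i], counters[i]);
        return 0;
    }

Because both expansions come from the one list, the enum and the name table cannot drift out of sync.
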
+/* internal dir - just a wrapper */
+
+SHOW(bch2_fs_internal)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+
+ return bch2_fs_to_text(out, &c->kobj, attr);
+}
+
+STORE(bch2_fs_internal)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+
+ return bch2_fs_store(&c->kobj, attr, buf, size);
+}
+SYSFS_OPS(bch2_fs_internal);
+
+struct attribute *bch2_fs_internal_files[] = {
+ &sysfs_journal_debug,
+ &sysfs_btree_updates,
+ &sysfs_btree_cache,
+ &sysfs_btree_key_cache,
+ &sysfs_new_stripes,
+ &sysfs_stripes_heap,
+ &sysfs_open_buckets,
+ &sysfs_open_buckets_partial,
+ &sysfs_write_points,
+#ifdef BCH_WRITE_REF_DEBUG
+ &sysfs_write_refs,
+#endif
+ &sysfs_nocow_lock_table,
+ &sysfs_io_timers_read,
+ &sysfs_io_timers_write,
+
+ &sysfs_trigger_gc,
+ &sysfs_trigger_discards,
+ &sysfs_trigger_invalidates,
+ &sysfs_prune_cache,
+ &sysfs_btree_wakeup,
+
+ &sysfs_gc_gens_pos,
+
+ &sysfs_copy_gc_enabled,
+ &sysfs_copy_gc_wait,
+
+ &sysfs_rebalance_enabled,
+ &sysfs_rebalance_status,
+ sysfs_pd_controller_files(rebalance),
+
+ &sysfs_moving_ctxts,
+
+ &sysfs_internal_uuid,
+
+ &sysfs_disk_groups,
+ NULL
+};
+
+/* options */
+
+SHOW(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+ int id = opt - bch2_opt_table;
+ u64 v = bch2_opt_get_by_id(&c->opts, id);
+
+ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
+ prt_char(out, '\n');
+
+ return 0;
+}
+
+STORE(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+ int ret, id = opt - bch2_opt_table;
+ char *tmp;
+ u64 v;
+
+ /*
+ * We don't need to take c->writes for correctness, but it eliminates an
+ * unsightly error message in the dmesg log when we're RO:
+ */
+ if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
+ return -EROFS;
+
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
+ kfree(tmp);
+
+ if (ret < 0)
+ goto err;
+
+ ret = bch2_opt_check_may_set(c, id, v);
+ if (ret < 0)
+ goto err;
+
+ bch2_opt_set_sb(c, opt, v);
+ bch2_opt_set_by_id(&c->opts, id, v);
+
+ if ((id == Opt_background_target ||
+ id == Opt_background_compression) && v)
+ bch2_set_rebalance_needs_scan(c, 0);
+
+ ret = size;
+err:
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+ return ret;
+}
+SYSFS_OPS(bch2_fs_opts_dir);
+
+struct attribute *bch2_fs_opts_dir_files[] = { NULL };
+
+int bch2_opts_create_sysfs_files(struct kobject *kobj)
+{
+ const struct bch_option *i;
+ int ret;
+
+ for (i = bch2_opt_table;
+ i < bch2_opt_table + bch2_opts_nr;
+ i++) {
+ if (!(i->flags & OPT_FS))
+ continue;
+
+ ret = sysfs_create_file(kobj, &i->attr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* time stats */
+
+SHOW(bch2_fs_time_stats)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name) \
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
+ BCH_TIME_STATS()
+#undef x
+
+ return 0;
+}
+
+STORE(bch2_fs_time_stats)
+{
+ return size;
+}
+SYSFS_OPS(bch2_fs_time_stats);
+
+struct attribute *bch2_fs_time_stats_files[] = {
+#define x(name) \
+ &sysfs_time_stat_##name,
+ BCH_TIME_STATS()
+#undef x
+ NULL
+};
+
+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ unsigned i, nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ printbuf_tabstop_push(out, 8);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+
+ prt_tab(out);
+ prt_str(out, "buckets");
+ prt_tab_rjust(out);
+ prt_str(out, "sectors");
+ prt_tab_rjust(out);
+ prt_str(out, "fragmented");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ prt_str(out, bch2_data_types[i]);
+ prt_tab(out);
+ prt_u64(out, stats.d[i].buckets);
+ prt_tab_rjust(out);
+ prt_u64(out, stats.d[i].sectors);
+ prt_tab_rjust(out);
+ prt_u64(out, stats.d[i].fragmented);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ prt_str(out, "ec");
+ prt_tab(out);
+ prt_u64(out, stats.buckets_ec);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_newline(out);
+
+ prt_printf(out, "reserves:");
+ prt_newline(out);
+ for (i = 0; i < BCH_WATERMARK_NR; i++) {
+ prt_str(out, bch2_watermarks[i]);
+ prt_tab(out);
+ prt_u64(out, bch2_dev_buckets_reserved(ca, i));
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 24);
+
+ prt_str(out, "freelist_wait");
+ prt_tab(out);
+ prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
+ prt_newline(out);
+
+ prt_str(out, "open buckets allocated");
+ prt_tab(out);
+ prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
+ prt_newline(out);
+
+ prt_str(out, "open buckets this dev");
+ prt_tab(out);
+ prt_u64(out, ca->nr_open_buckets);
+ prt_newline(out);
+
+ prt_str(out, "open buckets total");
+ prt_tab(out);
+ prt_u64(out, OPEN_BUCKETS_COUNT);
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_wait");
+ prt_tab(out);
+ prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_btree");
+ prt_tab(out);
+ prt_u64(out, nr[BCH_DATA_btree]);
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_user");
+ prt_tab(out);
+ prt_u64(out, nr[BCH_DATA_user]);
+ prt_newline(out);
+
+ prt_str(out, "buckets_to_invalidate");
+ prt_tab(out);
+ prt_u64(out, should_invalidate_buckets(ca, stats));
+ prt_newline(out);
+
+ prt_str(out, "btree reserve cache");
+ prt_tab(out);
+ prt_u64(out, c->btree_reserve_cache_nr);
+ prt_newline(out);
+}
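
dev_alloc_debug_to_text() lays its table out with printbuf tabstops: printbuf_tabstop_push() fixes the column positions once, prt_tab() advances to the next stop, and prt_tab_rjust() right-justifies what was just emitted against that stop. Outside the kernel roughly the same layout falls out of printf field widths; a sketch with invented numbers, not the printbuf API:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        static const char *const types[] = { "free", "sb", "journal", "btree", "user" };
        /* Hypothetical per-data-type usage: buckets, sectors, fragmented */
        static const uint64_t d[][3] = {
            { 1000,       0,    0 },
            {    2,     512,    0 },
            {    8,    4096,    0 },
            {  120,   61440,  128 },
            { 3400, 1740800, 9216 },
        };

        printf("%-8s%16s%16s%16s\n", "", "buckets", "sectors", "fragmented");
        for (unsigned i = 0; i < sizeof(types) / sizeof(types[0]); i++)
            printf("%-8s%16llu%16llu%16llu\n", types[i],
                   (unsigned long long)d[i][0],
                   (unsigned long long)d[i][1],
                   (unsigned long long)d[i][2]);
        return 0;
    }
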
+
+static const char * const bch2_rw[] = {
+ "read",
+ "write",
+ NULL
+};
+
+static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ int rw, i;
+
+ for (rw = 0; rw < 2; rw++) {
+ prt_printf(out, "%s:\n", bch2_rw[rw]);
+
+ for (i = 1; i < BCH_DATA_NR; i++)
+ prt_printf(out, "%-12s:%12llu\n",
+ bch2_data_types[i],
+ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
+ }
+}
+
+SHOW(bch2_dev)
+{
+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+ struct bch_fs *c = ca->fs;
+
+ sysfs_printf(uuid, "%pU\n", ca->uuid.b);
+
+ sysfs_print(bucket_size, bucket_bytes(ca));
+ sysfs_print(first_bucket, ca->mi.first_bucket);
+ sysfs_print(nbuckets, ca->mi.nbuckets);
+ sysfs_print(durability, ca->mi.durability);
+ sysfs_print(discard, ca->mi.discard);
+
+ if (attr == &sysfs_label) {
+ if (ca->mi.group)
+ bch2_disk_path_to_text(out, c, ca->mi.group - 1);
+ prt_char(out, '\n');
+ }
+
+ if (attr == &sysfs_has_data) {
+ prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_char(out, '\n');
+ }
+
+ if (attr == &sysfs_state_rw) {
+ prt_string_option(out, bch2_member_states, ca->mi.state);
+ prt_char(out, '\n');
+ }
+
+ if (attr == &sysfs_io_done)
+ dev_io_done_to_text(out, ca);
+
+ if (attr == &sysfs_io_errors)
+ bch2_dev_io_errors_to_text(out, ca);
+
+ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
+ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
+
+ if (attr == &sysfs_io_latency_stats_read)
+ bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+
+ if (attr == &sysfs_io_latency_stats_write)
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+
+ sysfs_printf(congested, "%u%%",
+ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
+ * 100 / CONGESTED_MAX);
+
+ if (attr == &sysfs_alloc_debug)
+ dev_alloc_debug_to_text(out, ca);
+
+ return 0;
+}
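
The congested attribute above reports the device's congestion counter as a percentage: the raw atomic is clamped to [0, CONGESTED_MAX] and scaled by 100 / CONGESTED_MAX. A tiny check of that arithmetic; the CONGESTED_MAX value below is only a stand-in, the real constant is defined elsewhere in bcachefs:

    #include <stdio.h>

    #define DEMO_CONGESTED_MAX 1024     /* stand-in value for illustration */

    static unsigned congested_pct(int raw)
    {
        if (raw < 0)
            raw = 0;
        if (raw > DEMO_CONGESTED_MAX)
            raw = DEMO_CONGESTED_MAX;
        return raw * 100 / DEMO_CONGESTED_MAX;
    }

    int main(void)
    {
        /* prints "0% 25% 100%" */
        printf("%u%% %u%% %u%%\n",
               congested_pct(-5), congested_pct(256), congested_pct(5000));
        return 0;
    }
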
+
+STORE(bch2_dev)
+{
+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+ struct bch_fs *c = ca->fs;
+ struct bch_member *mi;
+
+ if (attr == &sysfs_discard) {
+ bool v = strtoul_or_return(buf);
+
+ mutex_lock(&c->sb_lock);
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+ if (v != BCH_MEMBER_DISCARD(mi)) {
+ SET_BCH_MEMBER_DISCARD(mi, v);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (attr == &sysfs_durability) {
+ u64 v = strtoul_or_return(buf);
+
+ mutex_lock(&c->sb_lock);
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+ if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
+ SET_BCH_MEMBER_DURABILITY(mi, v + 1);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (attr == &sysfs_label) {
+ char *tmp;
+ int ret;
+
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ ret = bch2_dev_group_set(c, ca, strim(tmp));
+ kfree(tmp);
+ if (ret)
+ return ret;
+ }
+
+ if (attr == &sysfs_io_errors_reset)
+ bch2_dev_errors_reset(ca);
+
+ return size;
+}
+SYSFS_OPS(bch2_dev);
+
+struct attribute *bch2_dev_files[] = {
+ &sysfs_uuid,
+ &sysfs_bucket_size,
+ &sysfs_first_bucket,
+ &sysfs_nbuckets,
+ &sysfs_durability,
+
+ /* settings: */
+ &sysfs_discard,
+ &sysfs_state_rw,
+ &sysfs_label,
+
+ &sysfs_has_data,
+ &sysfs_io_done,
+ &sysfs_io_errors,
+ &sysfs_io_errors_reset,
+
+ &sysfs_io_latency_read,
+ &sysfs_io_latency_write,
+ &sysfs_io_latency_stats_read,
+ &sysfs_io_latency_stats_write,
+ &sysfs_congested,
+
+ /* debug: */
+ &sysfs_alloc_debug,
+ NULL
+};
+
+#endif /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
new file mode 100644
index 000000000000..222cd5062702
--- /dev/null
+++ b/fs/bcachefs/sysfs.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SYSFS_H_
+#define _BCACHEFS_SYSFS_H_
+
+#include <linux/sysfs.h>
+
+#ifndef NO_BCACHEFS_SYSFS
+
+struct attribute;
+struct sysfs_ops;
+
+extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_counters_files[];
+extern struct attribute *bch2_fs_internal_files[];
+extern struct attribute *bch2_fs_opts_dir_files[];
+extern struct attribute *bch2_fs_time_stats_files[];
+extern struct attribute *bch2_dev_files[];
+
+extern const struct sysfs_ops bch2_fs_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern const struct sysfs_ops bch2_dev_sysfs_ops;
+
+int bch2_opts_create_sysfs_files(struct kobject *);
+
+#else
+
+static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_counters_files[] = {};
+static struct attribute *bch2_fs_internal_files[] = {};
+static struct attribute *bch2_fs_opts_dir_files[] = {};
+static struct attribute *bch2_fs_time_stats_files[] = {};
+static struct attribute *bch2_dev_files[] = {};
+
+static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+static const struct sysfs_ops bch2_dev_sysfs_ops;
+
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
+
+#endif /* NO_BCACHEFS_SYSFS */
+
+#endif /* _BCACHEFS_SYSFS_H_ */
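
The header keeps callers oblivious to whether sysfs support is built: with NO_BCACHEFS_SYSFS defined, the attribute arrays become empty static definitions and bch2_opts_create_sysfs_files() becomes an inline no-op, so the rest of the filesystem compiles unchanged either way. The conditional-stub pattern in isolation, with hypothetical names:

    /* Build with -DNO_DEMO_SYSFS to get the no-op variant. */
    #include <stdio.h>

    #ifndef NO_DEMO_SYSFS

    static int demo_create_files(const char *dir)
    {
        printf("creating files under %s\n", dir);
        return 0;
    }

    #else

    static inline int demo_create_files(const char *dir) { return 0; }

    #endif

    int main(void)
    {
        return demo_create_files("/sys/fs/demo");
    }
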
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
new file mode 100644
index 000000000000..2fc9e60c754b
--- /dev/null
+++ b/fs/bcachefs/tests.c
@@ -0,0 +1,919 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_TESTS
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "journal_reclaim.h"
+#include "snapshot.h"
+#include "tests.h"
+
+#include <linux/kthread.h>
+#include <linux/random.h>
+
+static void delete_test_keys(struct bch_fs *c)
+{
+ int ret;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
+ BUG_ON(ret);
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
+ BUG_ON(ret);
+}
+
+/* unit tests */
+
+static int test_delete(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k.p.snapshot = U32_MAX;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_INTENT);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, &k.k_i, 0));
+ bch_err_msg(c, ret, "update error");
+ if (ret)
+ goto err;
+
+ pr_info("deleting once");
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error (first)");
+ if (ret)
+ goto err;
+
+ pr_info("deleting twice");
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error (second)");
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
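
test_delete() and the tests that follow wrap every btree update in commit_do(), which re-evaluates the expression it is given whenever the commit asks for a transaction restart and only surfaces other errors to the caller. Stripped of the btree machinery the idiom is a plain retry loop; a sketch with a made-up restart code, not the real bcachefs error numbers:

    #include <stdio.h>

    #define DEMO_RESTART 1024       /* hypothetical "please retry" code */

    static int attempts;

    static int do_update(void)
    {
        /* Fail twice with a restart, then succeed. */
        return ++attempts < 3 ? -DEMO_RESTART : 0;
    }

    static int commit_do_demo(int (*op)(void))
    {
        int ret;

        do {
            ret = op();
        } while (ret == -DEMO_RESTART);

        return ret;
    }

    int main(void)
    {
        int ret = commit_do_demo(do_update);

        printf("commit_do_demo: %d after %d attempts\n", ret, attempts);
        return 0;
    }
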
+
+static int test_delete_written(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k.p.snapshot = U32_MAX;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_INTENT);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, &k.k_i, 0));
+ bch_err_msg(c, ret, "update error");
+ if (ret)
+ goto err;
+
+ bch2_trans_unlock(trans);
+ bch2_journal_flush_all_pins(&c->journal);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error");
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_iterate(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ u64 i;
+ int ret = 0;
+
+ delete_test_keys(c);
+
+ pr_info("inserting test keys");
+
+ for (i = 0; i < nr; i++) {
+ struct bkey_i_cookie ck;
+
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i;
+ ck.k.p.snapshot = U32_MAX;
+
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ goto err;
+ }
+
+ pr_info("iterating forwards");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i++);
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i != nr);
+
+ pr_info("iterating backwards");
+
+ ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, U64_MAX, U32_MAX), 0, k,
+ ({
+ BUG_ON(k.k->p.offset != --i);
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating backwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_iterate_extents(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ u64 i;
+ int ret = 0;
+
+ delete_test_keys(c);
+
+ pr_info("inserting test extents");
+
+ for (i = 0; i < nr; i += 8) {
+ struct bkey_i_cookie ck;
+
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i + 8;
+ ck.k.p.snapshot = U32_MAX;
+ ck.k.size = 8;
+
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ goto err;
+ }
+
+ pr_info("iterating forwards");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i);
+ i = k.k->p.offset;
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i != nr);
+
+ pr_info("iterating backwards");
+
+ ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
+ SPOS(0, U64_MAX, U32_MAX), 0, k,
+ ({
+ BUG_ON(k.k->p.offset != i);
+ i = bkey_start_offset(k.k);
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating backwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_iterate_slots(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ u64 i;
+ int ret = 0;
+
+ delete_test_keys(c);
+
+ pr_info("inserting test keys");
+
+ for (i = 0; i < nr; i++) {
+ struct bkey_i_cookie ck;
+
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i * 2;
+ ck.k.p.snapshot = U32_MAX;
+
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ goto err;
+ }
+
+ pr_info("iterating forwards");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i);
+ i += 2;
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i != nr * 2);
+
+ pr_info("iterating forwards by slots");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i >= nr * 2)
+ break;
+
+ BUG_ON(k.k->p.offset != i);
+ BUG_ON(bkey_deleted(k.k) != (i & 1));
+
+ i++;
+ 0;
+ }));
+ if (ret < 0) {
+ bch_err_msg(c, ret, "error iterating forwards by slots");
+ goto err;
+ }
+ ret = 0;
+err:
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ u64 i;
+ int ret = 0;
+
+ delete_test_keys(c);
+
+ pr_info("inserting test keys");
+
+ for (i = 0; i < nr; i += 16) {
+ struct bkey_i_cookie ck;
+
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i + 16;
+ ck.k.p.snapshot = U32_MAX;
+ ck.k.size = 8;
+
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ goto err;
+ }
+
+ pr_info("iterating forwards");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i + 8);
+ BUG_ON(k.k->size != 8);
+ i += 16;
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i != nr);
+
+ pr_info("iterating forwards by slots");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i == nr)
+ break;
+ BUG_ON(bkey_deleted(k.k) != !(i % 16));
+
+ BUG_ON(bkey_start_offset(k.k) != i);
+ BUG_ON(k.k->size != 8);
+ i = k.k->p.offset;
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards by slots");
+ if (ret)
+ goto err;
+ ret = 0;
+err:
+ bch2_trans_put(trans);
+ return ret;
+}
+
+/*
+ * XXX: we really want to make sure we've got a btree with depth > 0 for these
+ * tests
+ */
+static int test_peek_end(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return 0;
+}
+
+static int test_peek_end_extents(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), 0);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return 0;
+}
+
+/* extent unit tests */
+
+static u64 test_version;
+
+static int insert_test_extent(struct bch_fs *c,
+ u64 start, u64 end)
+{
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k_i.k.p.offset = end;
+ k.k_i.k.p.snapshot = U32_MAX;
+ k.k_i.k.size = end - start;
+ k.k_i.k.version.lo = test_version++;
+
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __test_extent_overwrite(struct bch_fs *c,
+ u64 e1_start, u64 e1_end,
+ u64 e2_start, u64 e2_end)
+{
+ int ret;
+
+ ret = insert_test_extent(c, e1_start, e1_end) ?:
+ insert_test_extent(c, e2_start, e2_end);
+
+ delete_test_keys(c);
+ return ret;
+}
+
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
+ __test_extent_overwrite(c, 8, 64, 0, 32);
+}
+
+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 0, 64, 32, 72);
+}
+
+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 32, 40);
+}
+
+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 0, 128) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 128);
+}
+
+static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid)
+{
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k_i.k.p.inode = inum;
+ k.k_i.k.p.offset = start + len;
+ k.k_i.k.p.snapshot = snapid;
+ k.k_i.k.size = len;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
+{
+ return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */
+ insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?:
+ insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?:
+ insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */
+ insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?:
+ insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?:
+ insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX);
+}
+
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie cookie;
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = snapid_hi;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+ if (ret)
+ return ret;
+
+ trans = bch2_trans_get(c);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, snapid_lo), 0);
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+
+ BUG_ON(k.k->p.snapshot != U32_MAX);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+ struct bkey_i_cookie cookie;
+ u32 snapids[2];
+ u32 snapid_subvols[2] = { 1, 1 };
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = U32_MAX;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_snapshot_node_create(trans, U32_MAX,
+ snapids,
+ snapid_subvols,
+ 2));
+ if (ret)
+ return ret;
+
+ if (snapids[0] > snapids[1])
+ swap(snapids[0], snapids[1]);
+
+ ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+ bch_err_msg(c, ret, "from test_snapshot_filter");
+ return ret;
+}
+
+/* perf tests */
+
+static u64 test_rand(void)
+{
+ u64 v;
+
+ get_random_bytes(&v, sizeof(v));
+ return v;
+}
+
+static int rand_insert(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_i_cookie k;
+ int ret = 0;
+ u64 i;
+
+ for (i = 0; i < nr; i++) {
+ bkey_cookie_init(&k.k_i);
+ k.k.p.offset = test_rand();
+ k.k.p.snapshot = U32_MAX;
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int rand_insert_multi(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_i_cookie k[8];
+ int ret = 0;
+ unsigned j;
+ u64 i;
+
+ for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
+ for (j = 0; j < ARRAY_SIZE(k); j++) {
+ bkey_cookie_init(&k[j].k_i);
+ k[j].k.p.offset = test_rand();
+ k[j].k.p.snapshot = U32_MAX;
+ }
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_put(trans);
+ return ret;
+}
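
rand_insert_multi() strings its eight updates together with the GNU "a ?: b" operator: evaluation stops at the first call that returns nonzero, so the chain yields either 0 or the first error, and all eight updates land in one commit. The chaining idiom by itself, with hypothetical step functions (GNU extension, as used throughout the kernel):

    #include <stdio.h>

    static int step(int n, int fail_at)
    {
        printf("step %d\n", n);
        return n == fail_at ? -1 : 0;
    }

    static int run_all(int fail_at)
    {
        /* GNU ?: keeps the first nonzero result and skips the rest */
        return  step(1, fail_at) ?:
            step(2, fail_at) ?:
            step(3, fail_at);
    }

    int main(void)
    {
        /* steps 1 and 2 run, step 3 is skipped */
        printf("ret = %d\n", run_all(2));
        return 0;
    }
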
+
+static int rand_lookup(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+ u64 i;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
+
+ for (i = 0; i < nr; i++) {
+ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+ ret = bkey_err(k);
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int rand_mixed_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i_cookie *cookie,
+ u64 i, u64 pos)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ bch_err_msg(trans->c, ret, "lookup error");
+ if (ret)
+ return ret;
+
+ if (!(i & 3) && k.k) {
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
+ }
+
+ return ret;
+}
+
+static int rand_mixed(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_i_cookie cookie;
+ int ret = 0;
+ u64 i, rand;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
+
+ for (i = 0; i < nr; i++) {
+ rand = test_rand();
+ ret = commit_do(trans, NULL, NULL, 0,
+ rand_mixed_trans(trans, &iter, &cookie, i, rand));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int __do_delete(struct btree_trans *trans, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ goto err;
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int rand_delete(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
+ u64 i;
+
+ for (i = 0; i < nr; i++) {
+ struct bpos pos = SPOS(0, test_rand(), U32_MAX);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ __do_delete(trans, pos));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int seq_insert(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie insert;
+
+ bkey_cookie_init(&insert.k_i);
+
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+ NULL, NULL, 0, ({
+ if (iter.pos.offset >= nr)
+ break;
+ insert.k.p = iter.pos;
+ bch2_trans_update(trans, &iter, &insert.k_i, 0);
+ })));
+}
+
+static int seq_lookup(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ return bch2_trans_run(c,
+ for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k,
+ 0));
+}
+
+static int seq_overwrite(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, 0, ({
+ struct bkey_i_cookie u;
+
+ bkey_reassemble(&u.k_i, k);
+ bch2_trans_update(trans, &iter, &u.k_i, 0);
+ })));
+}
+
+static int seq_delete(struct bch_fs *c, u64 nr)
+{
+ return bch2_btree_delete_range(c, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
+}
+
+typedef int (*perf_test_fn)(struct bch_fs *, u64);
+
+struct test_job {
+ struct bch_fs *c;
+ u64 nr;
+ unsigned nr_threads;
+ perf_test_fn fn;
+
+ atomic_t ready;
+ wait_queue_head_t ready_wait;
+
+ atomic_t done;
+ struct completion done_completion;
+
+ u64 start;
+ u64 finish;
+ int ret;
+};
+
+static int btree_perf_test_thread(void *data)
+{
+ struct test_job *j = data;
+ int ret;
+
+ if (atomic_dec_and_test(&j->ready)) {
+ wake_up(&j->ready_wait);
+ j->start = sched_clock();
+ } else {
+ wait_event(j->ready_wait, !atomic_read(&j->ready));
+ }
+
+ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
+ if (ret) {
+ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
+ j->ret = ret;
+ }
+
+ if (atomic_dec_and_test(&j->done)) {
+ j->finish = sched_clock();
+ complete(&j->done_completion);
+ }
+
+ return 0;
+}
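
btree_perf_test_thread() times the run with two countdown counters: the last worker to decrement ->ready stamps the start time and wakes the others, and the last to decrement ->done stamps the finish time and completes the job, so thread setup and teardown never pollute the measurement. A portable C11/pthreads sketch of the same rendezvous (illustrative only, not kernel code):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <time.h>

    #define NR_THREADS 4

    static atomic_int ready = NR_THREADS;
    static atomic_int done  = NR_THREADS;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  go   = PTHREAD_COND_INITIALIZER;
    static struct timespec start_ts, finish_ts;

    static void *worker(void *arg)
    {
        (void)arg;

        pthread_mutex_lock(&lock);
        if (atomic_fetch_sub(&ready, 1) == 1) {
            /* last thread to arrive starts the clock and releases everyone */
            clock_gettime(CLOCK_MONOTONIC, &start_ts);
            pthread_cond_broadcast(&go);
        } else {
            while (atomic_load(&ready))
                pthread_cond_wait(&go, &lock);
        }
        pthread_mutex_unlock(&lock);

        /* ... run this thread's share of the benchmark here ... */

        if (atomic_fetch_sub(&done, 1) == 1)
            /* last thread to finish stops the clock */
            clock_gettime(CLOCK_MONOTONIC, &finish_ts);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NR_THREADS];

        for (int i = 0; i < NR_THREADS; i++)
            pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < NR_THREADS; i++)
            pthread_join(t[i], NULL);

        printf("elapsed: %lld ns\n",
               (long long)(finish_ts.tv_sec - start_ts.tv_sec) * 1000000000LL +
               (finish_ts.tv_nsec - start_ts.tv_nsec));
        return 0;
    }
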
+
+int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+ u64 nr, unsigned nr_threads)
+{
+ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
+ char name_buf[20];
+ struct printbuf nr_buf = PRINTBUF;
+ struct printbuf per_sec_buf = PRINTBUF;
+ unsigned i;
+ u64 time;
+
+ atomic_set(&j.ready, nr_threads);
+ init_waitqueue_head(&j.ready_wait);
+
+ atomic_set(&j.done, nr_threads);
+ init_completion(&j.done_completion);
+
+#define perf_test(_test) \
+ if (!strcmp(testname, #_test)) j.fn = _test
+
+ perf_test(rand_insert);
+ perf_test(rand_insert_multi);
+ perf_test(rand_lookup);
+ perf_test(rand_mixed);
+ perf_test(rand_delete);
+
+ perf_test(seq_insert);
+ perf_test(seq_lookup);
+ perf_test(seq_overwrite);
+ perf_test(seq_delete);
+
+ /* a unit test, not a perf test: */
+ perf_test(test_delete);
+ perf_test(test_delete_written);
+ perf_test(test_iterate);
+ perf_test(test_iterate_extents);
+ perf_test(test_iterate_slots);
+ perf_test(test_iterate_slots_extents);
+ perf_test(test_peek_end);
+ perf_test(test_peek_end_extents);
+
+ perf_test(test_extent_overwrite_front);
+ perf_test(test_extent_overwrite_back);
+ perf_test(test_extent_overwrite_middle);
+ perf_test(test_extent_overwrite_all);
+ perf_test(test_extent_create_overlapping);
+
+ perf_test(test_snapshots);
+
+ if (!j.fn) {
+ pr_err("unknown test %s", testname);
+ return -EINVAL;
+ }
+
+ //pr_info("running test %s:", testname);
+
+ if (nr_threads == 1)
+ btree_perf_test_thread(&j);
+ else
+ for (i = 0; i < nr_threads; i++)
+ kthread_run(btree_perf_test_thread, &j,
+ "bcachefs perf test[%u]", i);
+
+ while (wait_for_completion_interruptible(&j.done_completion))
+ ;
+
+ time = j.finish - j.start;
+
+ scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
+ prt_human_readable_u64(&nr_buf, nr);
+ prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
+ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
+ name_buf, nr_buf.buf, nr_threads,
+ div_u64(time, NSEC_PER_SEC),
+ div_u64(time * nr_threads, nr),
+ per_sec_buf.buf);
+ printbuf_exit(&per_sec_buf);
+ printbuf_exit(&nr_buf);
+ return j.ret;
+}
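
The summary printk derives its three figures from the elapsed nanoseconds: whole seconds as time / NSEC_PER_SEC, nanoseconds per iteration as time * nr_threads / nr (each thread only runs nr / nr_threads iterations, so the per-thread cost per key is scaled back up), and keys per second as nr * NSEC_PER_SEC / time. For example, 1,000,000 keys on 4 threads in 2.5 s of wall time works out to 10,000 ns per iteration and 400,000 keys per second; a minimal check of the arithmetic with those made-up numbers:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        uint64_t nr = 1000000, nr_threads = 4, time_ns = 2500000000ULL;

        printf("%llu sec, %llu nsec per iter, %llu per sec\n",
               (unsigned long long)(time_ns / NSEC_PER_SEC),
               (unsigned long long)(time_ns * nr_threads / nr),
               (unsigned long long)(nr * NSEC_PER_SEC / time_ns));
        return 0;
    }
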
+
+#endif /* CONFIG_BCACHEFS_TESTS */
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
new file mode 100644
index 000000000000..c73b18aea7e0
--- /dev/null
+++ b/fs/bcachefs/tests.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TEST_H
+#define _BCACHEFS_TEST_H
+
+struct bch_fs;
+
+#ifdef CONFIG_BCACHEFS_TESTS
+
+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+
+#else
+
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
new file mode 100644
index 000000000000..dc48b52b01b4
--- /dev/null
+++ b/fs/bcachefs/trace.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
+#include "keylist.h"
+#include "move_types.h"
+#include "opts.h"
+#include "six.h"
+
+#include <linux/blktrace_api.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
new file mode 100644
index 000000000000..fd49b63562c3
--- /dev/null
+++ b/fs/bcachefs/trace.h
@@ -0,0 +1,1327 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bcachefs
+
+#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BCACHEFS_H
+
+#include <linux/tracepoint.h>
+
+#define TRACE_BPOS_entries(name) \
+ __field(u64, name##_inode ) \
+ __field(u64, name##_offset ) \
+ __field(u32, name##_snapshot )
+
+#define TRACE_BPOS_assign(dst, src) \
+ __entry->dst##_inode = (src).inode; \
+ __entry->dst##_offset = (src).offset; \
+ __entry->dst##_snapshot = (src).snapshot
+
+DECLARE_EVENT_CLASS(bpos,
+ TP_PROTO(const struct bpos *p),
+ TP_ARGS(p),
+
+ TP_STRUCT__entry(
+ TRACE_BPOS_entries(p)
+ ),
+
+ TP_fast_assign(
+ TRACE_BPOS_assign(p, *p);
+ ),
+
+ TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
+);
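
TRACE_BPOS_entries()/TRACE_BPOS_assign() bundle a struct bpos into tracepoints: a single prefix argument fans out into three prefixed fields and the three matching assignments, so every event that logs a position declares and fills them identically. The same field-bundling trick outside the tracepoint machinery, with a hypothetical event struct:

    #include <stdio.h>
    #include <stdint.h>

    struct bpos { uint64_t inode, offset; uint32_t snapshot; };

    #define BPOS_FIELDS(name)           \
        uint64_t name##_inode;          \
        uint64_t name##_offset;         \
        uint32_t name##_snapshot;

    #define BPOS_ASSIGN(dst, ev, src)                       \
        do {                                                \
            (ev)->dst##_inode    = (src).inode;             \
            (ev)->dst##_offset   = (src).offset;            \
            (ev)->dst##_snapshot = (src).snapshot;          \
        } while (0)

    struct demo_event {
        BPOS_FIELDS(pos)
    };

    int main(void)
    {
        struct bpos p = { .inode = 4096, .offset = 8, .snapshot = 1 };
        struct demo_event ev;

        BPOS_ASSIGN(pos, &ev, p);
        printf("%llu:%llu:%u\n",
               (unsigned long long)ev.pos_inode,
               (unsigned long long)ev.pos_offset,
               ev.pos_snapshot);
        return 0;
    }
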
+
+DECLARE_EVENT_CLASS(bkey,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k),
+
+ TP_STRUCT__entry(
+ __string(k, k )
+ ),
+
+ TP_fast_assign(
+ __assign_str(k, k);
+ ),
+
+ TP_printk("%s", __get_str(k))
+);
+
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->level,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(bch_fs,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ ),
+
+ TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
+);
+
+DECLARE_EVENT_CLASS(bio,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(sector_t, sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ (unsigned long long)__entry->sector, __entry->nr_sector)
+);
+
+/* super-io.c: */
+TRACE_EVENT(write_super,
+ TP_PROTO(struct bch_fs *c, unsigned long ip),
+ TP_ARGS(c, ip),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned long, ip )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->ip = ip;
+ ),
+
+ TP_printk("%d,%d for %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (void *) __entry->ip)
+);
+
+/* io.c: */
+
+DEFINE_EVENT(bio, read_promote,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+TRACE_EVENT(read_nopromote,
+ TP_PROTO(struct bch_fs *c, int ret),
+ TP_ARGS(c, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, ret, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+ ),
+
+ TP_printk("%d,%d ret %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ret)
+);
+
+DEFINE_EVENT(bio, read_bounce,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_split,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_retry,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_reuse_race,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+/* Journal */
+
+DEFINE_EVENT(bch_fs, journal_full,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, journal_entry_full,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bio, journal_write,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+TRACE_EVENT(journal_reclaim_start,
+ TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
+ u64 min_nr, u64 min_key_cache,
+ u64 btree_cache_dirty, u64 btree_cache_total,
+ u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
+ btree_cache_dirty, btree_cache_total,
+ btree_key_cache_dirty, btree_key_cache_total),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(bool, direct )
+ __field(bool, kicked )
+ __field(u64, min_nr )
+ __field(u64, min_key_cache )
+ __field(u64, btree_cache_dirty )
+ __field(u64, btree_cache_total )
+ __field(u64, btree_key_cache_dirty )
+ __field(u64, btree_key_cache_total )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->direct = direct;
+ __entry->kicked = kicked;
+ __entry->min_nr = min_nr;
+ __entry->min_key_cache = min_key_cache;
+ __entry->btree_cache_dirty = btree_cache_dirty;
+ __entry->btree_cache_total = btree_cache_total;
+ __entry->btree_key_cache_dirty = btree_key_cache_dirty;
+ __entry->btree_key_cache_total = btree_key_cache_total;
+ ),
+
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->direct,
+ __entry->kicked,
+ __entry->min_nr,
+ __entry->min_key_cache,
+ __entry->btree_cache_dirty,
+ __entry->btree_cache_total,
+ __entry->btree_key_cache_dirty,
+ __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+ TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+ TP_ARGS(c, nr_flushed),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, nr_flushed )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->nr_flushed = nr_flushed;
+ ),
+
+ TP_printk("%d,%d flushed %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_flushed)
+);
+
+/* bset.c: */
+
+DEFINE_EVENT(bpos, bkey_pack_pos_fail,
+ TP_PROTO(const struct bpos *p),
+ TP_ARGS(p)
+);
+
+/* Btree cache: */
+
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(long nr_to_scan, long can_free, long ret),
+ TP_ARGS(nr_to_scan, can_free, ret),
+
+ TP_STRUCT__entry(
+ __field(long, nr_to_scan )
+ __field(long, can_free )
+ __field(long, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_to_scan = nr_to_scan;
+ __entry->can_free = can_free;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("scanned for %li nodes, can free %li, ret %li",
+ __entry->nr_to_scan, __entry->can_free, __entry->ret)
+);
+
+DEFINE_EVENT(btree_node, btree_cache_reap,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+/* Btree */
+
+DEFINE_EVENT(btree_node, btree_node_read,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_node_write,
+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
+ TP_ARGS(b, bytes, sectors),
+
+ TP_STRUCT__entry(
+ __field(enum btree_node_type, type)
+ __field(unsigned, bytes )
+ __field(unsigned, sectors )
+ ),
+
+ TP_fast_assign(
+ __entry->type = btree_node_type(b);
+ __entry->bytes = bytes;
+ __entry->sectors = sectors;
+ ),
+
+ TP_printk("bkey type %u bytes %u sectors %u",
+ __entry->type , __entry->bytes, __entry->sectors)
+);
+
+DEFINE_EVENT(btree_node, btree_node_alloc,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_free,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_reserve_get_fail,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ size_t required,
+ int ret),
+ TP_ARGS(trans_fn, caller_ip, required, ret),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(size_t, required )
+ __array(char, ret, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->required = required;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+ ),
+
+ TP_printk("%s %pS required %zu ret %s",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->required,
+ __entry->ret)
+);
+
+DEFINE_EVENT(btree_node, btree_node_compact,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_merge,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_split,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_rewrite,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_set_root,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_path_relock_fail,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __array(char, node, 24 )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
+ ),
+
+ TP_fast_assign(
+ struct btree *b = btree_path_node(path, level);
+ struct six_lock_count c;
+
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = path->level;
+ TRACE_BPOS_assign(pos, path->pos);
+
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+
+ if (IS_ERR(b)) {
+ strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
+ } else {
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ }
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level)
+ ? six_lock_seq(&path->l[level].b->c.lock)
+ : 0;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->node,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
+);
+
+TRACE_EVENT(btree_path_upgrade_fail,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __field(u8, locked )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
+ ),
+
+ TP_fast_assign(
+ struct six_lock_count c;
+
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = level;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->locked = btree_node_locked(path, level);
+
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level)
+ ? six_lock_seq(&path->l[level].b->c.lock)
+ : 0;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->locked,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
+);
+
+/* Garbage collection */
+
+DEFINE_EVENT(bch_fs, gc_gens_start,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, gc_gens_end,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+/* Allocator */
+
+DECLARE_EVENT_CLASS(bucket_alloc,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err),
+
+ TP_STRUCT__entry(
+ __field(u8, dev )
+ __array(char, reserve, 16 )
+ __field(u64, bucket )
+ __field(u64, free )
+ __field(u64, avail )
+ __field(u64, copygc_wait_amount )
+ __field(s64, copygc_waiting_for )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, nouse )
+ __field(bool, nonblocking )
+ __field(u64, nocow )
+ __array(char, err, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = ca->dev_idx;
+ strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+ __entry->bucket = bucket;
+ __entry->free = free;
+ __entry->avail = avail;
+ __entry->copygc_wait_amount = copygc_wait_amount;
+ __entry->copygc_waiting_for = copygc_waiting_for;
+ __entry->seen = s->buckets_seen;
+ __entry->open = s->skipped_open;
+ __entry->need_journal_commit = s->skipped_need_journal_commit;
+ __entry->nouse = s->skipped_nouse;
+ __entry->nonblocking = nonblocking;
+ __entry->nocow = s->skipped_nocow;
+ strscpy(__entry->err, err, sizeof(__entry->err));
+ ),
+
+ TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
+ __entry->reserve,
+ __entry->dev,
+ __entry->bucket,
+ __entry->free,
+ __entry->avail,
+ __entry->copygc_wait_amount,
+ __entry->copygc_waiting_for,
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->nouse,
+ __entry->nocow,
+ __entry->nonblocking,
+ __entry->err)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err)
+);
+
+TRACE_EVENT(discard_buckets,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, discarded )
+ __array(char, err, 16 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->seen = seen;
+ __entry->open = open;
+ __entry->need_journal_commit = need_journal_commit;
+ __entry->discarded = discarded;
+ strscpy(__entry->err, err, sizeof(__entry->err));
+ ),
+
+ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->discarded,
+ __entry->err)
+);
+
+TRACE_EVENT(bucket_invalidate,
+ TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
+ TP_ARGS(c, dev, bucket, sectors),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u32, dev_idx )
+ __field(u32, sectors )
+ __field(u64, bucket )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->dev_idx = dev;
+ __entry->sectors = sectors;
+ __entry->bucket = bucket;
+ ),
+
+ TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dev_idx, __entry->bucket,
+ __entry->sectors)
+);
+
+/* Moving IO */
+
+TRACE_EVENT(bucket_evacuate,
+ TP_PROTO(struct bch_fs *c, struct bpos *bucket),
+ TP_ARGS(c, bucket),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u32, dev_idx )
+ __field(u64, bucket )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->dev_idx = bucket->inode;
+ __entry->bucket = bucket->offset;
+ ),
+
+ TP_printk("%d:%d %u:%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dev_idx, __entry->bucket)
+);
+
+DEFINE_EVENT(bkey, move_extent,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_read,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_write,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_finish,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k)
+);
+
+TRACE_EVENT(move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *msg),
+ TP_ARGS(c, msg),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __string(msg, msg )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __assign_str(msg, msg);
+ ),
+
+ TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+);
+
+DEFINE_EVENT(bkey, move_extent_start_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+TRACE_EVENT(move_data,
+ TP_PROTO(struct bch_fs *c,
+ struct bch_move_stats *stats),
+ TP_ARGS(c, stats),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, keys_moved )
+ __field(u64, keys_raced )
+ __field(u64, sectors_seen )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_raced )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->keys_moved = atomic64_read(&stats->keys_moved);
+ __entry->keys_raced = atomic64_read(&stats->keys_raced);
+ __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
+ __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
+ __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
+ ),
+
+ TP_printk("%d,%d keys moved %llu raced %llu"
+ "sectors seen %llu moved %llu raced %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->keys_moved,
+ __entry->keys_raced,
+ __entry->sectors_seen,
+ __entry->sectors_moved,
+ __entry->sectors_raced)
+);
+
+TRACE_EVENT(evacuate_bucket,
+ TP_PROTO(struct bch_fs *c, struct bpos *bucket,
+ unsigned sectors, unsigned bucket_size,
+ u64 fragmentation, int ret),
+ TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, member )
+ __field(u64, bucket )
+ __field(u32, sectors )
+ __field(u32, bucket_size )
+ __field(u64, fragmentation )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->member = bucket->inode;
+ __entry->bucket = bucket->offset;
+ __entry->sectors = sectors;
+ __entry->bucket_size = bucket_size;
+ __entry->fragmentation = fragmentation;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->member, __entry->bucket,
+ __entry->sectors, __entry->bucket_size,
+ __entry->fragmentation, __entry->ret)
+);
+
+TRACE_EVENT(copygc,
+ TP_PROTO(struct bch_fs *c,
+ u64 sectors_moved, u64 sectors_not_moved,
+ u64 buckets_moved, u64 buckets_not_moved),
+ TP_ARGS(c,
+ sectors_moved, sectors_not_moved,
+ buckets_moved, buckets_not_moved),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_not_moved )
+ __field(u64, buckets_moved )
+ __field(u64, buckets_not_moved )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->sectors_moved = sectors_moved;
+ __entry->sectors_not_moved = sectors_not_moved;
+ __entry->buckets_moved = buckets_moved;
+ __entry->buckets_not_moved = buckets_not_moved;
+ ),
+
+ TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->sectors_moved, __entry->sectors_not_moved,
+ __entry->buckets_moved, __entry->buckets_not_moved)
+);
+
+TRACE_EVENT(copygc_wait,
+ TP_PROTO(struct bch_fs *c,
+ u64 wait_amount, u64 until),
+ TP_ARGS(c, wait_amount, until),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, wait_amount )
+ __field(u64, until )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->wait_amount = wait_amount;
+ __entry->until = until;
+ ),
+
+ TP_printk("%d,%u waiting for %llu sectors until %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->wait_amount, __entry->until)
+);
+
+/* btree transactions: */
+
+DECLARE_EVENT_CLASS(transaction_event,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
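+
+/*
+ * Each DEFINE_EVENT() of this class gets a generated trace_<name>() helper
+ * (standard TRACE_EVENT machinery) taking the same arguments as TP_PROTO,
+ * so a call site looks roughly like:
+ *
+ *	trace_transaction_commit(trans, _RET_IP_);
+ *
+ * where _RET_IP_ supplies the caller_ip recorded above. (Illustrative
+ * sketch only; actual call sites may wrap this in their own helpers.)
+ */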
+
+DEFINE_EVENT(transaction_event, transaction_commit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_injected,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_split_race,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree *b),
+ TP_ARGS(trans, caller_ip, b),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, level )
+ __field(u16, written )
+ __field(u16, blocks )
+ __field(u16, u64s_remaining )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->level = b->c.level;
+ __entry->written = b->written;
+ __entry->blocks = btree_blocks(trans->c);
+ __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+ ),
+
+ TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
+ __entry->trans_fn, (void *) __entry->caller_ip,
+ __entry->level,
+ __entry->written, __entry->blocks,
+ __entry->u64s_remaining)
+);
+
+DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_journal_preres_get,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ unsigned flags),
+ TP_ARGS(trans, caller_ip, flags),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned, flags )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("%s %pS %x", __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->flags)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_traverse_all,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_too_many_iters,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DECLARE_EVENT_CLASS(transaction_restart_iter,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos)
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+struct get_locks_fail;
+
+TRACE_EVENT(trans_restart_upgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_locks_want,
+ unsigned new_locks_want,
+ struct get_locks_fail *f),
+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, old_locks_want )
+ __field(u8, new_locks_want )
+ __field(u8, level )
+ __field(u32, path_seq )
+ __field(u32, node_seq )
+ __field(u32, path_alloc_seq )
+ __field(u32, downgrade_seq)
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = new_locks_want;
+ __entry->level = f->l;
+ __entry->path_seq = path->l[f->l].lock_seq;
+ __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
+ __entry->path_alloc_seq = path->alloc_seq;
+ __entry->downgrade_seq = path->downgrade_seq;
+ TRACE_BPOS_assign(pos, path->pos)
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_locks_want,
+ __entry->new_locks_want,
+ __entry->level,
+ __entry->path_seq,
+ __entry->node_seq,
+ __entry->path_alloc_seq,
+ __entry->downgrade_seq)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_would_deadlock_write,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ ),
+
+ TP_printk("%s", __entry->trans_fn)
+);
+
+TRACE_EVENT(trans_restart_mem_realloced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ unsigned long bytes),
+ TP_ARGS(trans, caller_ip, bytes),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned long, bytes )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->bytes = bytes;
+ ),
+
+ TP_printk("%s %pS bytes %lu",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->bytes)
+);
+
+TRACE_EVENT(trans_restart_key_cache_key_realloced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_u64s,
+ unsigned new_u64s),
+ TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(enum btree_id, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u32, old_u64s )
+ __field(u32, new_u64s )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->old_u64s = old_u64s;
+ __entry->new_u64s = new_u64s;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_u64s,
+ __entry->new_u64s)
+);
+
+TRACE_EVENT(path_downgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
+ TP_ARGS(trans, nr, skipped, fast, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, skipped )
+ __field(size_t, fast )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->skipped = skipped;
+ __entry->fast = fast;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu skipped %zu fast %zu",
+ __entry->nr, __entry->size, __entry->skipped, __entry->fast)
+);
+
+TRACE_EVENT(write_buffer_flush_slowpath,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
+ TP_ARGS(trans, nr, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu", __entry->nr, __entry->size)
+);
+
+#endif /* _TRACE_BCACHEFS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../fs/bcachefs
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c
new file mode 100644
index 000000000000..9764c2e6a910
--- /dev/null
+++ b/fs/bcachefs/two_state_shared_lock.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "two_state_shared_lock.h"
+
+void __bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ __wait_event(lock->wait, bch2_two_state_trylock(lock, s));
+}
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
new file mode 100644
index 000000000000..905801772002
--- /dev/null
+++ b/fs/bcachefs/two_state_shared_lock.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TWO_STATE_LOCK_H
+#define _BCACHEFS_TWO_STATE_LOCK_H
+
+#include <linux/atomic.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include "util.h"
+
+/*
+ * Two-state lock - can be taken for add or block - each state is shared,
+ * like the read side of an rwsem, but the two states conflict with each
+ * other:
+ */
+typedef struct {
+ atomic_long_t v;
+ wait_queue_head_t wait;
+} two_state_lock_t;
+
+static inline void two_state_lock_init(two_state_lock_t *lock)
+{
+ atomic_long_set(&lock->v, 0);
+ init_waitqueue_head(&lock->wait);
+}
+
+static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+
+ EBUG_ON(atomic_long_read(&lock->v) == 0);
+
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
+ wake_up_all(&lock->wait);
+}
+
+static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+ long v = atomic_long_read(&lock->v), old;
+
+ do {
+ old = v;
+
+ if (i > 0 ? v < 0 : v > 0)
+ return false;
+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+ old, old + i)) != old);
+ return true;
+}
+
+void __bch2_two_state_lock(two_state_lock_t *, int);
+
+static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ if (!bch2_two_state_trylock(lock, s))
+ __bch2_two_state_lock(lock, s);
+}
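+
+/*
+ * Typical usage (illustrative sketch): the two states are selected by the
+ * integer @s passed to lock/unlock, so one side always passes 0 and the
+ * other always passes 1:
+ *
+ *	static two_state_lock_t example_lock;
+ *
+ *	// "add" side: shared with other s == 0 holders
+ *	bch2_two_state_lock(&example_lock, 0);
+ *	...
+ *	bch2_two_state_unlock(&example_lock, 0);
+ *
+ *	// "block" side: shared with other s == 1 holders, excludes s == 0
+ *	bch2_two_state_lock(&example_lock, 1);
+ *	...
+ *	bch2_two_state_unlock(&example_lock, 1);
+ *
+ * example_lock is a hypothetical name; initialize it with
+ * two_state_lock_init() before first use.
+ */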
+
+#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
new file mode 100644
index 000000000000..84b142fcc3df
--- /dev/null
+++ b/fs/bcachefs/util.c
@@ -0,0 +1,1159 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/console.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/log2.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/sched/clock.h>
+
+#include "eytzinger.h"
+#include "mean_and_variance.h"
+#include "util.h"
+
+static const char si_units[] = "?kMGTPEZY";
+
+/* string_get_size units: */
+static const char *const units_2[] = {
+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+};
+static const char *const units_10[] = {
+ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+};
+
+static int parse_u64(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 v = 0;
+
+ if (!isdigit(*cp))
+ return -EINVAL;
+
+ do {
+ if (v > U64_MAX / 10)
+ return -ERANGE;
+ v *= 10;
+ if (v > U64_MAX - (*cp - '0'))
+ return -ERANGE;
+ v += *cp - '0';
+ cp++;
+ } while (isdigit(*cp));
+
+ *res = v;
+ return cp - start;
+}
+
+static int bch2_pow(u64 n, u64 p, u64 *res)
+{
+ *res = 1;
+
+ while (p--) {
+ if (*res > div_u64(U64_MAX, n))
+ return -ERANGE;
+ *res *= n;
+ }
+ return 0;
+}
+
+static int parse_unit_suffix(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 base = 1024;
+ unsigned u;
+ int ret;
+
+ if (*cp == ' ')
+ cp++;
+
+ for (u = 1; u < strlen(si_units); u++)
+ if (*cp == si_units[u]) {
+ cp++;
+ goto got_unit;
+ }
+
+ for (u = 0; u < ARRAY_SIZE(units_2); u++)
+ if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
+ cp += strlen(units_2[u]);
+ goto got_unit;
+ }
+
+ for (u = 0; u < ARRAY_SIZE(units_10); u++)
+ if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
+ cp += strlen(units_10[u]);
+ base = 1000;
+ goto got_unit;
+ }
+
+ *res = 1;
+ return 0;
+got_unit:
+ ret = bch2_pow(base, u, res);
+ if (ret)
+ return ret;
+
+ return cp - start;
+}
+
+#define parse_or_ret(cp, _f) \
+do { \
+ int _ret = _f; \
+ if (_ret < 0) \
+ return _ret; \
+ cp += _ret; \
+} while (0)
+
+static int __bch2_strtou64_h(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 v = 0, b, f_n = 0, f_d = 1;
+ int ret;
+
+ parse_or_ret(cp, parse_u64(cp, &v));
+
+ if (*cp == '.') {
+ cp++;
+ ret = parse_u64(cp, &f_n);
+ if (ret < 0)
+ return ret;
+ cp += ret;
+
+ ret = bch2_pow(10, ret, &f_d);
+ if (ret)
+ return ret;
+ }
+
+ parse_or_ret(cp, parse_unit_suffix(cp, &b));
+
+ if (v > div_u64(U64_MAX, b))
+ return -ERANGE;
+ v *= b;
+
+ if (f_n > div_u64(U64_MAX, b))
+ return -ERANGE;
+
+ f_n = div_u64(f_n * b, f_d);
+ if (v + f_n < v)
+ return -ERANGE;
+ v += f_n;
+
+ *res = v;
+ return cp - start;
+}
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+ u64 t_max, bool t_signed)
+{
+ bool positive = *cp != '-';
+ u64 v = 0;
+
+ if (*cp == '+' || *cp == '-')
+ cp++;
+
+ parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
+
+ if (*cp == '\n')
+ cp++;
+ if (*cp)
+ return -EINVAL;
+
+ if (positive) {
+ if (v > t_max)
+ return -ERANGE;
+ } else {
+ if (v && !t_signed)
+ return -ERANGE;
+
+ if (v > t_max + 1)
+ return -ERANGE;
+ v = -v;
+ }
+
+ *res = v;
+ return 0;
+}
+
+#define STRTO_H(name, type) \
+int bch2_ ## name ## _h(const char *cp, type *res) \
+{ \
+ u64 v = 0; \
+ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
+ ANYSINT_MAX(type) != ((type) ~0ULL)); \
+ *res = v; \
+ return ret; \
+}
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+STRTO_H(strtou64, u64)
+
+u64 bch2_read_flag_list(char *opt, const char * const list[])
+{
+ u64 ret = 0;
+ char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
+
+ if (!d)
+ return -ENOMEM;
+
+ s = strim(d);
+
+ while ((p = strsep(&s, ","))) {
+ int flag = match_string(list, -1, p);
+
+ if (flag < 0) {
+ ret = -1;
+ break;
+ }
+
+ ret |= 1 << flag;
+ }
+
+ kfree(d);
+
+ return ret;
+}
+
+bool bch2_is_zero(const void *_p, size_t n)
+{
+ const char *p = _p;
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ if (p[i])
+ return false;
+ return true;
+}
+
+void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+{
+ while (nr_bits)
+ prt_char(out, '0' + ((v >> --nr_bits) & 1));
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
+{
+ const char *p;
+
+ if (!lines) {
+ printk("%s (null)\n", prefix);
+ return;
+ }
+
+ console_lock();
+ while (1) {
+ p = strchrnul(lines, '\n');
+ printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+ if (!*p)
+ break;
+ lines = p + 1;
+ }
+ console_unlock();
+}
+
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
+{
+#ifdef CONFIG_STACKTRACE
+ unsigned nr_entries = 0;
+ int ret = 0;
+
+ stack->nr = 0;
+ ret = darray_make_room(stack, 32);
+ if (ret)
+ return ret;
+
+ if (!down_read_trylock(&task->signal->exec_update_lock))
+ return -1;
+
+ do {
+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0);
+ } while (nr_entries == stack->size &&
+ !(ret = darray_make_room(stack, stack->size * 2)));
+
+ stack->nr = nr_entries;
+ up_read(&task->signal->exec_update_lock);
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
+{
+ unsigned long *i;
+
+ darray_for_each(*stack, i) {
+ prt_printf(out, "[<0>] %pB", (void *) *i);
+ prt_newline(out);
+ }
+}
+
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
+{
+ bch_stacktrace stack = { 0 };
+ int ret = bch2_save_backtrace(&stack, task);
+
+ bch2_prt_backtrace(out, &stack);
+ darray_exit(&stack);
+ return ret;
+}
+
+/* time stats: */
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
+{
+ unsigned i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct bch2_quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned, e->step / 2, 1);
+
+ if (v >= e->m)
+ break;
+
+ i = eytzinger0_child(i, v > e->m);
+ }
+}
+
+static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+
+ if (time_after64(end, start)) {
+ duration = end - start;
+ mean_and_variance_update(&stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ bch2_quantiles_update(&stats->quantiles, duration);
+ }
+
+ if (time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ stats->last_event = end;
+ }
+}
+
+static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct bch2_time_stat_buffer *b)
+{
+ struct bch2_time_stat_buffer_entry *i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&stats->lock, flags);
+ for (i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ spin_unlock_irqrestore(&stats->lock, flags);
+
+ b->nr = 0;
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
+ "time_stats: min_duration = %llu, min_freq = %llu",
+ stats->min_duration, stats->min_freq);
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ bch2_time_stats_update_one(stats, start, end);
+
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
+ stats->duration_stats.n > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct bch2_time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct bch2_time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ bch2_time_stats_clear_buffer(stats, b);
+ preempt_enable();
+ }
+}
+#endif
+
+static const struct time_unit {
+ const char *name;
+ u64 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
+static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
+ prt_tab_rjust(out);
+ prt_printf(out, "%s", u->name);
+}
+
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%ptT", &sec);
+ prt_str(out, buf);
+}
+#endif
+
+#define TABSTOP_SIZE 12
+
+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
+{
+ prt_str(out, name);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, ns);
+ prt_newline(out);
+}
+
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
+{
+ const struct time_unit *u;
+ s64 f_mean = 0, d_mean = 0;
+ u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
+ int i;
+ /*
+ * avoid divide by zero
+ */
+ if (stats->freq_stats.n) {
+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+ }
+
+ printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
+ prt_printf(out, "count:");
+ prt_tab(out);
+ prt_printf(out, "%llu ",
+ stats->duration_stats.n);
+ printbuf_tabstop_pop(out);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+ printbuf_tabstop_push(out, 0);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+
+ prt_tab(out);
+ prt_printf(out, "since mount");
+ prt_tab_rjust(out);
+ prt_tab(out);
+ prt_printf(out, "recent");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+ printbuf_tabstop_push(out, 2);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+
+ prt_printf(out, "duration of events");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_duration);
+ pr_name_and_units(out, "max:", stats->max_duration);
+
+ prt_printf(out, "mean:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, d_mean);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, d_stddev);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ prt_printf(out, "time between events");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_freq);
+ pr_name_and_units(out, "max:", stats->max_freq);
+
+ prt_printf(out, "mean:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, f_mean);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, f_stddev);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ i = eytzinger0_first(NR_QUANTILES);
+ u = pick_time_units(stats->quantiles.entries[i].m);
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ q = max(stats->quantiles.entries[i].m, last_q);
+ prt_printf(out, "%llu ",
+ div_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
+ last_q = q;
+ }
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+ free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->duration_stats_weighted.weight = 8;
+ stats->freq_stats_weighted.weight = 8;
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
+ spin_lock_init(&stats->lock);
+}
+
+/* ratelimit: */
+
+/**
+ * bch2_ratelimit_delay() - return how long to delay until the next time to do
+ * some work
+ * @d: the struct bch_ratelimit to update
+ * Returns: the amount of time to delay by, in jiffies
+ */
+u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
+{
+ u64 now = local_clock();
+
+ return time_after64(d->next, now)
+ ? nsecs_to_jiffies(d->next - now)
+ : 0;
+}
+
+/**
+ * bch2_ratelimit_increment() - increment @d by the amount of work done
+ * @d: the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
+ */
+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+ u64 now = local_clock();
+
+ d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+ if (time_before64(now + NSEC_PER_SEC, d->next))
+ d->next = now + NSEC_PER_SEC;
+
+ if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+ d->next = now - NSEC_PER_SEC * 2;
+}
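+
+/*
+ * Typical use (illustrative sketch, hypothetical worker loop): ask how long
+ * to delay before each unit of work, sleep if told to, then account the
+ * work that was done so the rate limit advances:
+ *
+ *	while (!kthread_should_stop()) {
+ *		u64 delay = bch2_ratelimit_delay(d);
+ *
+ *		if (delay)
+ *			schedule_timeout_interruptible(delay);
+ *
+ *		// ... do @done units of work ...
+ *		bch2_ratelimit_increment(d, done);
+ *	}
+ *
+ * The units of @done are arbitrary; d->rate is then interpreted as that
+ * many units per second.
+ */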
+
+/* pd controller: */
+
+/*
+ * Updates the pd_controller. Attempts to scale input values to units per second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
+ */
+void bch2_pd_controller_update(struct bch_pd_controller *pd,
+ s64 target, s64 actual, int sign)
+{
+ s64 proportional, derivative, change;
+
+ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
+
+ if (seconds_since_update == 0)
+ return;
+
+ pd->last_update = jiffies;
+
+ proportional = actual - target;
+ proportional *= seconds_since_update;
+ proportional = div_s64(proportional, pd->p_term_inverse);
+
+ derivative = actual - pd->last_actual;
+ derivative = div_s64(derivative, seconds_since_update);
+ derivative = ewma_add(pd->smoothed_derivative, derivative,
+ (pd->d_term / seconds_since_update) ?: 1);
+ derivative = derivative * pd->d_term;
+ derivative = div_s64(derivative, pd->p_term_inverse);
+
+ change = proportional + derivative;
+
+ /* Don't increase rate if not keeping up */
+ if (change > 0 &&
+ pd->backpressure &&
+ time_after64(local_clock(),
+ pd->rate.next + NSEC_PER_MSEC))
+ change = 0;
+
+ change *= (sign * -1);
+
+ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
+ 1, UINT_MAX);
+
+ pd->last_actual = actual;
+ pd->last_derivative = derivative;
+ pd->last_proportional = proportional;
+ pd->last_change = change;
+ pd->last_target = target;
+}
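+
+/*
+ * Sign convention, by way of a hypothetical example: to keep an "actual"
+ * amount of dirty data near a "target" threshold with a rate that drains
+ * dirty data, pass sign == -1, because raising the rate makes actual go
+ * down:
+ *
+ *	bch2_pd_controller_update(pd, target_dirty, current_dirty, -1);
+ *
+ * With sign == 1 the controller instead raises the rate while actual is
+ * below target. target_dirty/current_dirty are illustrative names only.
+ */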
+
+void bch2_pd_controller_init(struct bch_pd_controller *pd)
+{
+ pd->rate.rate = 1024;
+ pd->last_update = jiffies;
+ pd->p_term_inverse = 6000;
+ pd->d_term = 30;
+ pd->d_smooth = pd->d_term;
+ pd->backpressure = 1;
+}
+
+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
+
+ prt_printf(out, "rate:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->rate.rate);
+ prt_newline(out);
+
+ prt_printf(out, "target:");
+ prt_tab(out);
+ prt_human_readable_u64(out, pd->last_target);
+ prt_newline(out);
+
+ prt_printf(out, "actual:");
+ prt_tab(out);
+ prt_human_readable_u64(out, pd->last_actual);
+ prt_newline(out);
+
+ prt_printf(out, "proportional:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_proportional);
+ prt_newline(out);
+
+ prt_printf(out, "derivative:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_derivative);
+ prt_newline(out);
+
+ prt_printf(out, "change:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_change);
+ prt_newline(out);
+
+ prt_printf(out, "next io:");
+ prt_tab(out);
+ prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
+ prt_newline(out);
+}
+
+/* misc: */
+
+void bch2_bio_map(struct bio *bio, void *base, size_t size)
+{
+ while (size) {
+ struct page *page = is_vmalloc_addr(base)
+ ? vmalloc_to_page(base)
+ : virt_to_page(base);
+ unsigned offset = offset_in_page(base);
+ unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
+
+ BUG_ON(!bio_add_page(bio, page, len, offset));
+ size -= len;
+ base += len;
+ }
+}
+
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
+{
+ while (size) {
+ struct page *page = alloc_pages(gfp_mask, 0);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
+
+ if (!page)
+ return -ENOMEM;
+
+ if (unlikely(!bio_add_page(bio, page, len, 0))) {
+ __free_page(page);
+ break;
+ }
+
+ size -= len;
+ }
+
+ return 0;
+}
+
+size_t bch2_rand_range(size_t max)
+{
+ size_t rand;
+
+ if (!max)
+ return 0;
+
+ do {
+ rand = get_random_long();
+ rand &= roundup_pow_of_two(max) - 1;
+ } while (rand >= max);
+
+ return rand;
+}
+
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, dst, iter, dst_iter) {
+ void *dstp = kmap_local_page(bv.bv_page);
+
+ memcpy(dstp + bv.bv_offset, src, bv.bv_len);
+ kunmap_local(dstp);
+
+ src += bv.bv_len;
+ }
+}
+
+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, src, iter, src_iter) {
+ void *srcp = kmap_local_page(bv.bv_page);
+
+ memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
+ kunmap_local(srcp);
+
+ dst += bv.bv_len;
+ }
+}
+
+static int alignment_ok(const void *base, size_t align)
+{
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+ ((unsigned long)base & (align - 1)) == 0;
+}
+
+static void u32_swap(void *a, void *b, size_t size)
+{
+ u32 t = *(u32 *)a;
+ *(u32 *)a = *(u32 *)b;
+ *(u32 *)b = t;
+}
+
+static void u64_swap(void *a, void *b, size_t size)
+{
+ u64 t = *(u64 *)a;
+ *(u64 *)a = *(u64 *)b;
+ *(u64 *)b = t;
+}
+
+static void generic_swap(void *a, void *b, size_t size)
+{
+ char t;
+
+ do {
+ t = *(char *)a;
+ *(char *)a++ = *(char *)b;
+ *(char *)b++ = t;
+ } while (--size > 0);
+}
+
+static inline int do_cmp(void *base, size_t n, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ size_t l, size_t r)
+{
+ return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size);
+}
+
+static inline void do_swap(void *base, size_t n, size_t size,
+ void (*swap_func)(void *, void *, size_t),
+ size_t l, size_t r)
+{
+ swap_func(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size);
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t))
+{
+ int i, c, r;
+
+ if (!swap_func) {
+ if (size == 4 && alignment_ok(base, 4))
+ swap_func = u32_swap;
+ else if (size == 8 && alignment_ok(base, 8))
+ swap_func = u64_swap;
+ else
+ swap_func = generic_swap;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ for (r = i; r * 2 + 1 < n; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < n &&
+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+ c++;
+
+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+ break;
+
+ do_swap(base, n, size, swap_func, r, c);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ do_swap(base, n, size, swap_func, 0, i);
+
+ for (r = 0; r * 2 + 1 < i; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < i &&
+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+ c++;
+
+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+ break;
+
+ do_swap(base, n, size, swap_func, r, c);
+ }
+ }
+}
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t size))
+{
+ /* pre-scale counters for performance */
+ int i = (num/2 - 1) * size, n = num * size, c, r;
+
+ if (!swap_func) {
+ if (size == 4 && alignment_ok(base, 4))
+ swap_func = u32_swap;
+ else if (size == 8 && alignment_ok(base, 8))
+ swap_func = u64_swap;
+ else
+ swap_func = generic_swap;
+ }
+
+ /* heapify */
+ for ( ; i >= 0; i -= size) {
+ for (r = i; r * 2 + size < n; r = c) {
+ c = r * 2 + size;
+ if (c < n - size &&
+ cmp_func(base + c, base + c + size, size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c, size) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+
+ /* sort */
+ for (i = n - size; i > 0; i -= size) {
+ swap_func(base, base + i, size);
+ for (r = 0; r * 2 + size < i; r = c) {
+ c = r * 2 + size;
+ if (c < i - size &&
+ cmp_func(base + c, base + c + size, size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c, size) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+}
+
+static void mempool_free_vp(void *element, void *pool_data)
+{
+ size_t size = (size_t) pool_data;
+
+ vpfree(element, size);
+}
+
+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t) pool_data;
+
+ return vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+ return size < PAGE_SIZE
+ ? mempool_init_kmalloc_pool(pool, min_nr, size)
+ : mempool_init(pool, min_nr, mempool_alloc_vp,
+ mempool_free_vp, (void *) size);
+}
+
+#if 0
+void eytzinger1_test(void)
+{
+ unsigned inorder, eytz, size;
+
+ pr_info("1 based eytzinger test:");
+
+ for (size = 2;
+ size < 65536;
+ size++) {
+ unsigned extra = eytzinger1_extra(size);
+
+ if (!(size % 4096))
+ pr_info("tree size %u", size);
+
+ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
+ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
+
+ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
+ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
+
+ inorder = 1;
+ eytzinger1_for_each(eytz, size) {
+ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
+ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
+ BUG_ON(eytz != eytzinger1_last(size) &&
+ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
+
+ inorder++;
+ }
+ }
+}
+
+void eytzinger0_test(void)
+{
+ unsigned inorder, eytz, size;
+
+ pr_info("0 based eytzinger test:");
+
+ for (size = 1;
+ size < 65536;
+ size++) {
+ unsigned extra = eytzinger0_extra(size);
+
+ if (!(size % 4096))
+ pr_info("tree size %u", size);
+
+ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
+ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
+
+ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
+ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
+
+ inorder = 0;
+ eytzinger0_for_each(eytz, size) {
+ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
+ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
+ BUG_ON(eytz != eytzinger0_last(size) &&
+ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
+
+ inorder++;
+ }
+ }
+}
+
+static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+{
+ const u16 *l = _l, *r = _r;
+
+ return (*l > *r) - (*l < *r);
+}
+
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+ int i, c1 = -1, c2 = -1;
+ ssize_t r;
+
+ r = eytzinger0_find_le(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0)
+ c1 = test_array[r];
+
+ for (i = 0; i < nr; i++)
+ if (test_array[i] <= search && test_array[i] > c2)
+ c2 = test_array[i];
+
+ if (c1 != c2) {
+ eytzinger0_for_each(i, nr)
+ pr_info("[%3u] = %12u", i, test_array[i]);
+ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
+ search, r, c1, c2);
+ }
+}
+
+void eytzinger0_find_test(void)
+{
+ unsigned i, nr, allocated = 1 << 12;
+ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
+
+ for (nr = 1; nr < allocated; nr++) {
+ pr_info("testing %u elems", nr);
+
+ get_random_bytes(test_array, nr * sizeof(test_array[0]));
+ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
+
+ /* verify array is sorted correctly: */
+ eytzinger0_for_each(i, nr)
+ BUG_ON(i != eytzinger0_last(nr) &&
+ test_array[i] > test_array[eytzinger0_next(i, nr)]);
+
+ for (i = 0; i < U16_MAX; i += 1 << 12)
+ eytzinger0_find_test_val(test_array, nr, i);
+
+ for (i = 0; i < nr; i++) {
+ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
+ eytzinger0_find_test_val(test_array, nr, test_array[i]);
+ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
+ }
+ }
+
+ kfree(test_array);
+}
+#endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when concurrent
+ * access to the percpu counters is otherwise excluded
+ */
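+/*
+ * Example (illustrative sketch): with an array of @nr percpu u64s whose
+ * writers are already excluded by the caller, the totals can be read back
+ * and printed as:
+ *
+ *	u64 *totals = bch2_acc_percpu_u64s(counters, nr);
+ *	unsigned i;
+ *
+ *	for (i = 0; i < nr; i++)
+ *		prt_printf(out, "%llu\n", totals[i]);
+ *
+ * counters, nr and out are hypothetical names; note that the other cpus'
+ * copies are zeroed as a side effect, so the totals end up on this cpu's
+ * copy.
+ */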
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+ u64 *ret;
+ int cpu;
+
+ /* access to pcpu vars has to be blocked by other locking */
+ preempt_disable();
+ ret = this_cpu_ptr(p);
+ preempt_enable();
+
+ for_each_possible_cpu(cpu) {
+ u64 *i = per_cpu_ptr(p, cpu);
+
+ if (i != ret) {
+ acc_u64s(ret, i, nr);
+ memset(i, 0, nr * sizeof(u64));
+ }
+ }
+
+ return ret;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
new file mode 100644
index 000000000000..b701f7fe0784
--- /dev/null
+++ b/fs/bcachefs/util.h
@@ -0,0 +1,834 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_UTIL_H
+#define _BCACHEFS_UTIL_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/closure.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/sched/clock.h>
+#include <linux/llist.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include "mean_and_variance.h"
+
+#include "darray.h"
+
+struct closure;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define EBUG_ON(cond) BUG_ON(cond)
+#else
+#define EBUG_ON(cond)
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define CPU_BIG_ENDIAN 0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define CPU_BIG_ENDIAN 1
+#endif
+
+/* type hackery */
+
+#define type_is_exact(_val, _type) \
+ __builtin_types_compatible_p(typeof(_val), _type)
+
+#define type_is(_val, _type) \
+ (__builtin_types_compatible_p(typeof(_val), _type) || \
+ __builtin_types_compatible_p(typeof(_val), const _type))
+
+/* Userspace doesn't align allocations as nicely as the kernel allocators: */
+static inline size_t buf_pages(void *p, size_t len)
+{
+ return DIV_ROUND_UP(len +
+ ((unsigned long) p & (PAGE_SIZE - 1)),
+ PAGE_SIZE);
+}
+
+static inline void vpfree(void *p, size_t size)
+{
+ if (is_vmalloc_addr(p))
+ vfree(p);
+ else
+ free_pages((unsigned long) p, get_order(size));
+}
+
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+{
+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+ get_order(size)) ?:
+ __vmalloc(size, gfp_mask);
+}
+
+static inline void kvpfree(void *p, size_t size)
+{
+ if (size < PAGE_SIZE)
+ kfree(p);
+ else
+ vpfree(p, size);
+}
+
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+ return size < PAGE_SIZE
+ ? kmalloc(size, gfp_mask)
+ : vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
+
+#define HEAP(type) \
+struct { \
+ size_t size, used; \
+ type *data; \
+}
+
+#define DECLARE_HEAP(type, name) HEAP(type) name
+
+#define init_heap(heap, _size, gfp) \
+({ \
+ (heap)->used = 0; \
+ (heap)->size = (_size); \
+ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+ (gfp)); \
+})
+
+#define free_heap(heap) \
+do { \
+ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
+ (heap)->data = NULL; \
+} while (0)
+
+#define heap_set_backpointer(h, i, _fn) \
+do { \
+ void (*fn)(typeof(h), size_t) = _fn; \
+ if (fn) \
+ fn(h, i); \
+} while (0)
+
+#define heap_swap(h, i, j, set_backpointer) \
+do { \
+ swap((h)->data[i], (h)->data[j]); \
+ heap_set_backpointer(h, i, set_backpointer); \
+ heap_set_backpointer(h, j, set_backpointer); \
+} while (0)
+
+#define heap_peek(h) \
+({ \
+ EBUG_ON(!(h)->used); \
+ (h)->data[0]; \
+})
+
+#define heap_full(h) ((h)->used == (h)->size)
+
+#define heap_sift_down(h, i, cmp, set_backpointer) \
+do { \
+ size_t _c, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _c) { \
+ _c = _j * 2 + 1; \
+ if (_c + 1 < (h)->used && \
+ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \
+ _c++; \
+ \
+ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \
+ break; \
+ heap_swap(h, _c, _j, set_backpointer); \
+ } \
+} while (0)
+
+#define heap_sift_up(h, i, cmp, set_backpointer) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \
+ break; \
+ heap_swap(h, i, p, set_backpointer); \
+ i = p; \
+ } \
+} while (0)
+
+#define __heap_add(h, d, cmp, set_backpointer) \
+({ \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ heap_set_backpointer(h, _i, set_backpointer); \
+ \
+ heap_sift_up(h, _i, cmp, set_backpointer); \
+ _i; \
+})
+
+#define heap_add(h, d, cmp, set_backpointer) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) \
+ __heap_add(h, d, cmp, set_backpointer); \
+ _r; \
+})
+
+#define heap_add_or_replace(h, new, cmp, set_backpointer) \
+do { \
+ if (!heap_add(h, new, cmp, set_backpointer) && \
+ cmp(h, new, heap_peek(h)) >= 0) { \
+ (h)->data[0] = new; \
+ heap_set_backpointer(h, 0, set_backpointer); \
+ heap_sift_down(h, 0, cmp, set_backpointer); \
+ } \
+} while (0)
+
+#define heap_del(h, i, cmp, set_backpointer) \
+do { \
+ size_t _i = (i); \
+ \
+ BUG_ON(_i >= (h)->used); \
+ (h)->used--; \
+ if ((_i) < (h)->used) { \
+ heap_swap(h, _i, (h)->used, set_backpointer); \
+ heap_sift_up(h, _i, cmp, set_backpointer); \
+ heap_sift_down(h, _i, cmp, set_backpointer); \
+ } \
+} while (0)
+
+#define heap_pop(h, d, cmp, set_backpointer) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ heap_del(h, 0, cmp, set_backpointer); \
+ } \
+ _r; \
+})
+
+#define heap_resort(heap, cmp, set_backpointer) \
+do { \
+ ssize_t _i; \
+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
+ heap_sift_down(heap, _i, cmp, set_backpointer); \
+} while (0)
+
+#define ANYSINT_MAX(t) \
+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+#include "printbuf.h"
+
+#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__)
+#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__)
+#define printbuf_str(_buf) bch2_printbuf_str(_buf)
+#define printbuf_exit(_buf) bch2_printbuf_exit(_buf)
+
+#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf)
+#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf)
+#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
+
+#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
+#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
+
+#define prt_newline(_out) bch2_prt_newline(_out)
+#define prt_tab(_out) bch2_prt_tab(_out)
+#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out)
+
+#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__)
+#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v))
+#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__)
+#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__)
+#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__)
+#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__)
+#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__)
+#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
+#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__)
+
+void bch2_pr_time_units(struct printbuf *, u64);
+void bch2_prt_datetime(struct printbuf *, time64_t);
+
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
+{
+ sprintf(out, "%pUb", uuid);
+}
+#else
+#include <uuid/uuid.h>
+#endif
+
+static inline void pr_uuid(struct printbuf *out, u8 *uuid)
+{
+ char uuid_str[40];
+
+ uuid_unparse_lower(uuid, uuid_str);
+ prt_printf(out, "%s", uuid_str);
+}
+
+int bch2_strtoint_h(const char *, int *);
+int bch2_strtouint_h(const char *, unsigned int *);
+int bch2_strtoll_h(const char *, long long *);
+int bch2_strtoull_h(const char *, unsigned long long *);
+int bch2_strtou64_h(const char *, u64 *);
+
+static inline int bch2_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch2_strtoint_h(cp, (int *) res);
+#else
+ return bch2_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch2_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch2_strtouint_h(cp, (unsigned int *) res);
+#else
+ return bch2_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res) \
+ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\
+ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\
+ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\
+ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\
+ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\
+ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\
+ : -EINVAL)
+
+#define strtoul_safe(cp, var) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = _v; \
+ _r; \
+})
+
+#define strtoul_safe_clamp(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = clamp_t(typeof(var), _v, min, max); \
+ _r; \
+})
+
+#define strtoul_safe_restrict(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r && _v >= min && _v <= max) \
+ var = _v; \
+ else \
+ _r = -EINVAL; \
+ _r; \
+})
+
+#define snprint(out, var) \
+ prt_printf(out, \
+ type_is(var, int) ? "%i\n" \
+ : type_is(var, unsigned) ? "%u\n" \
+ : type_is(var, long) ? "%li\n" \
+ : type_is(var, unsigned long) ? "%lu\n" \
+ : type_is(var, s64) ? "%lli\n" \
+ : type_is(var, u64) ? "%llu\n" \
+ : type_is(var, char *) ? "%s\n" \
+ : "%i\n", var)
+
+bool bch2_is_zero(const void *, size_t);
+
+u64 bch2_read_flag_list(char *, const char * const[]);
+
+void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines);
+
+typedef DARRAY(unsigned long) bch_stacktrace;
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *);
+void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *);
+
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+struct bch2_quantiles {
+ struct bch2_quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct bch2_time_stat_buffer {
+ unsigned nr;
+ struct bch2_time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[32];
+};
+
+struct bch2_time_stats {
+ spinlock_t lock;
+ /* all fields are in nanoseconds */
+ u64 max_duration;
+ u64 min_duration;
+ u64 max_freq;
+ u64 min_freq;
+ u64 last_event;
+ struct bch2_quantiles quantiles;
+
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance freq_stats;
+ struct mean_and_variance_weighted freq_stats_weighted;
+ struct bch2_time_stat_buffer __percpu *buffer;
+};
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+#endif
+
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+ __bch2_time_stats_update(stats, start, local_clock());
+}
+
+void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
+
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+#define ewma_add(ewma, val, weight) \
+({ \
+ typeof(ewma) _ewma = (ewma); \
+ typeof(weight) _weight = (weight); \
+ \
+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
+})
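+
ewma_add() keeps an exponentially weighted moving average where the weight is a power-of-two shift: new = (old * (2^weight - 1) + val) >> weight. A minimal user-space sketch of the same arithmetic (the helper name and values are illustrative):

#include <stdio.h>
#include <stdint.h>

/* user-space copy of the ewma_add() arithmetic; weight is a shift amount */
static uint64_t ewma_add_sketch(uint64_t ewma, uint64_t val, unsigned weight)
{
	return (((ewma << weight) - ewma) + val) >> weight;
}

int main(void)
{
	uint64_t avg = 100;

	/* weight 3: each new sample contributes 1/8th of the average */
	avg = ewma_add_sketch(avg, 200, 3);	/* (100 * 7 + 200) / 8 = 112 */
	printf("%llu\n", (unsigned long long) avg);
	return 0;
}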
+
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
+ u64 next;
+
+ /*
+ * Rate at which we want to do work, in units per nanosecond.
+ * The units here correspond to the units passed to
+ * bch2_ratelimit_increment().
+ */
+ unsigned rate;
+};
+
+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
+{
+ d->next = local_clock();
+}
+
+u64 bch2_ratelimit_delay(struct bch_ratelimit *);
+void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
+
+struct bch_pd_controller {
+ struct bch_ratelimit rate;
+ unsigned long last_update;
+
+ s64 last_actual;
+ s64 smoothed_derivative;
+
+ unsigned p_term_inverse;
+ unsigned d_smooth;
+ unsigned d_term;
+
+ /* for exporting to sysfs (no effect on behavior) */
+ s64 last_derivative;
+ s64 last_proportional;
+ s64 last_change;
+ s64 last_target;
+
+ /*
+ * If true, the rate will not increase if bch2_ratelimit_delay()
+ * is not being called often enough.
+ */
+ bool backpressure;
+};
+
+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
+void bch2_pd_controller_init(struct bch_pd_controller *);
+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
+
+#define sysfs_pd_controller_attribute(name) \
+ rw_attribute(name##_rate); \
+ rw_attribute(name##_rate_bytes); \
+ rw_attribute(name##_rate_d_term); \
+ rw_attribute(name##_rate_p_term_inverse); \
+ read_attribute(name##_rate_debug)
+
+#define sysfs_pd_controller_files(name) \
+ &sysfs_##name##_rate, \
+ &sysfs_##name##_rate_bytes, \
+ &sysfs_##name##_rate_d_term, \
+ &sysfs_##name##_rate_p_term_inverse, \
+ &sysfs_##name##_rate_debug
+
+#define sysfs_pd_controller_show(name, var) \
+do { \
+ sysfs_hprint(name##_rate, (var)->rate.rate); \
+ sysfs_print(name##_rate_bytes, (var)->rate.rate); \
+ sysfs_print(name##_rate_d_term, (var)->d_term); \
+ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
+ \
+ if (attr == &sysfs_##name##_rate_debug) \
+ bch2_pd_controller_debug_to_text(out, var); \
+} while (0)
+
+#define sysfs_pd_controller_store(name, var) \
+do { \
+ sysfs_strtoul_clamp(name##_rate, \
+ (var)->rate.rate, 1, UINT_MAX); \
+ sysfs_strtoul_clamp(name##_rate_bytes, \
+ (var)->rate.rate, 1, UINT_MAX); \
+ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \
+ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \
+ (var)->p_term_inverse, 1, INT_MAX); \
+} while (0)
+
+#define container_of_or_null(ptr, type, member) \
+({ \
+ typeof(ptr) _ptr = ptr; \
+ _ptr ? container_of(_ptr, type, member) : NULL; \
+})
+
+/* Does linear interpolation between powers of two */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+ unsigned fract = x & ~(~0 << fract_bits);
+
+ x >>= fract_bits;
+ x = 1 << x;
+ x += (x * fract) >> fract_bits;
+
+ return x;
+}
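+
The comment above describes linear interpolation between powers of two: the high bits of x select the power, and the low fract_bits bits interpolate toward the next one. A hedged user-space sketch of the same computation (the input value is illustrative):

#include <stdio.h>

/* user-space copy of the interpolation done by fract_exp_two() */
static unsigned fract_exp_two_sketch(unsigned x, unsigned fract_bits)
{
	unsigned fract = x & ~(~0U << fract_bits);

	x >>= fract_bits;
	x = 1U << x;
	x += (x * fract) >> fract_bits;

	return x;
}

int main(void)
{
	/* exponent 3, fraction 128/256: halfway between 2^3 and 2^4 */
	printf("%u\n", fract_exp_two_sketch((3 << 8) | 128, 8));	/* prints 12 */
	return 0;
}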
+
+void bch2_bio_map(struct bio *bio, void *base, size_t);
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+ return bdev->bd_inode->i_size >> 9;
+}
+
+#define closure_bio_submit(bio, cl) \
+do { \
+ closure_get(cl); \
+ submit_bio(bio); \
+} while (0)
+
+#define kthread_wait(cond) \
+({ \
+ int _ret = 0; \
+ \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (kthread_should_stop()) { \
+ _ret = -1; \
+ break; \
+ } \
+ \
+ if (cond) \
+ break; \
+ \
+ schedule(); \
+ } \
+ set_current_state(TASK_RUNNING); \
+ _ret; \
+})
+
+#define kthread_wait_freezable(cond) \
+({ \
+ int _ret = 0; \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (kthread_should_stop()) { \
+ _ret = -1; \
+ break; \
+ } \
+ \
+ if (cond) \
+ break; \
+ \
+ schedule(); \
+ try_to_freeze(); \
+ } \
+ set_current_state(TASK_RUNNING); \
+ _ret; \
+})
+
+size_t bch2_rand_range(size_t);
+
+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
+void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+
+static inline void memcpy_u64s_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+}
+
+static inline void __memcpy_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+#ifdef CONFIG_X86_64
+ long d0, d1, d2;
+
+ asm volatile("rep ; movsq"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (u64s), "1" (dst), "2" (src)
+ : "memory");
+#else
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+#endif
+}
+
+static inline void memcpy_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
+ dst + u64s * sizeof(u64) <= src));
+
+ __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down(void *dst, const void *src,
+ unsigned u64s)
+{
+ __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst > src);
+
+ __memmove_u64s_down(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ memcpy_u64s_small(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst > src);
+
+ __memmove_u64s_down_small(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s;
+ u64 *src = (u64 *) _src + u64s;
+
+ while (u64s--)
+ *--dst = *--src;
+}
+
+static inline void memmove_u64s_up_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up_small(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s - 1;
+ u64 *src = (u64 *) _src + u64s - 1;
+
+#ifdef CONFIG_X86_64
+ long d0, d1, d2;
+
+ asm volatile("std ;\n"
+ "rep ; movsq\n"
+ "cld ;\n"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (u64s), "1" (dst), "2" (src)
+ : "memory");
+#else
+ while (u64s--)
+ *dst-- = *src--;
+#endif
+}
+
+static inline void memmove_u64s_up(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up(dst, src, u64s);
+}
+
+static inline void memmove_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+ if (dst < src)
+ __memmove_u64s_down(dst, src, u64s);
+ else
+ __memmove_u64s_up(dst, src, u64s);
+}
+
+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
+static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
+{
+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
+
+ memset(s + bytes, c, rem);
+}
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t));
+
+/* just the memmove, doesn't update @_nr */
+#define __array_insert_item(_array, _nr, _pos) \
+ memmove(&(_array)[(_pos) + 1], \
+ &(_array)[(_pos)], \
+ sizeof((_array)[0]) * ((_nr) - (_pos)))
+
+#define array_insert_item(_array, _nr, _pos, _new_item) \
+do { \
+ __array_insert_item(_array, _nr, _pos); \
+ (_nr)++; \
+ (_array)[(_pos)] = (_new_item); \
+} while (0)
+
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
+do { \
+ (_nr) -= (_nr_to_remove); \
+ memmove(&(_array)[(_pos)], \
+ &(_array)[(_pos) + (_nr_to_remove)], \
+ sizeof((_array)[0]) * ((_nr) - (_pos))); \
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos) \
+ array_remove_items(_array, _nr, _pos, 1)
+
+static inline void __move_gap(void *array, size_t element_size,
+ size_t nr, size_t size,
+ size_t old_gap, size_t new_gap)
+{
+ size_t gap_end = old_gap + size - nr;
+
+ if (new_gap < old_gap) {
+ size_t move = old_gap - new_gap;
+
+ memmove(array + element_size * (gap_end - move),
+ array + element_size * (old_gap - move),
+ element_size * move);
+ } else if (new_gap > old_gap) {
+ size_t move = new_gap - old_gap;
+
+ memmove(array + element_size * old_gap,
+ array + element_size * gap_end,
+ element_size * move);
+ }
+}
+
+/* Move the gap in a gap buffer: */
+#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
+ __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
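+
A gap buffer keeps nr live elements in an array of _size slots, with the unused slots forming a contiguous gap; __move_gap() relocates that gap with a single memmove of the elements that cross it. A minimal user-space sketch (element type and positions are illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 6 live chars in an 8-slot buffer, gap (the dots) currently at index 6 */
	char buf[8] = { 'a', 'b', 'c', 'd', 'e', 'f', '.', '.' };
	size_t nr = 6, size = 8, old_gap = 6, new_gap = 2;

	/* same memmove __move_gap() does when the gap moves left */
	size_t gap_end = old_gap + size - nr;	/* 8 */
	size_t move = old_gap - new_gap;	/* 4 */

	memmove(buf + gap_end - move, buf + old_gap - move, move);

	/* live elements c..f now sit after the gap; buf[2..3] is the stale gap */
	printf("%.*s|gap|%.*s\n", (int) new_gap, buf,
	       (int) (nr - new_gap), buf + new_gap + (size - nr));
	return 0;
}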
+
+#define bubble_sort(_base, _nr, _cmp) \
+do { \
+ ssize_t _i, _last; \
+ bool _swapped = true; \
+ \
+ for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
+ _swapped = false; \
+ for (_i = 0; _i < _last; _i++) \
+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
+ swap((_base)[_i], (_base)[_i + 1]); \
+ _swapped = true; \
+ } \
+ } \
+} while (0)
+
+static inline u64 percpu_u64_get(u64 __percpu *src)
+{
+ u64 ret = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ ret += *per_cpu_ptr(src, cpu);
+ return ret;
+}
+
+static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(dst, cpu) = 0;
+ this_cpu_write(*dst, src);
+}
+
+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
+{
+ unsigned i;
+
+ for (i = 0; i < nr; i++)
+ acc[i] += src[i];
+}
+
+static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
+ unsigned nr)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
+}
+
+static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memset(per_cpu_ptr(p, cpu), c, bytes);
+}
+
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
+#define cmp_int(l, r) ((l > r) - (l < r))
+
+static inline int u8_cmp(u8 l, u8 r)
+{
+ return cmp_int(l, r);
+}
+
+static inline int cmp_le32(__le32 l, __le32 r)
+{
+ return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
+}
+
+#include <linux/uuid.h>
+
+#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
new file mode 100644
index 000000000000..cb4f33ed9ab3
--- /dev/null
+++ b/fs/bcachefs/varint.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/math.h>
+#include <linux/string.h>
+#include <asm/unaligned.h>
+
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
+#include "varint.h"
+
+/**
+ * bch2_varint_encode - encode a variable length integer
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
+ */
+int bch2_varint_encode(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+ __le64 v_le;
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0 << (bytes - 1));
+ v_le = cpu_to_le64(v);
+ memcpy(out, &v_le, bytes);
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ put_unaligned_le64(v, out);
+ }
+
+ return bytes;
+}
+
+/**
+ * bch2_varint_decode - decode a variable length integer
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ */
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+ unsigned bytes = likely(in < end)
+ ? ffz(*in & 255) + 1
+ : 1;
+ u64 v;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ __le64 v_le = 0;
+
+ memcpy(&v_le, in, bytes);
+ v = le64_to_cpu(v_le);
+ v >>= bytes;
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
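+
The encoding stores the length in the low bits of the first byte: an n-byte value has n - 1 trailing one-bits followed by a zero, with the payload shifted up past them; values wider than 56 bits fall back to a 0xff prefix followed by the raw 8-byte little-endian value. For example, 300 needs 9 significant bits, hence 2 bytes: (300 << 2) | 0b01 = 0x4b1, stored as b1 04, and the decoder sees one trailing one-bit in 0xb1, reads 2 bytes and shifts right by 2. A user-space sketch of the short form only (assumes a little-endian host and a value that fits in 56 bits):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* sketch of the short-form encoding used by bch2_varint_encode() */
static unsigned varint_encode_sketch(uint8_t *out, uint64_t v)
{
	unsigned bits = 64 - __builtin_clzll(v | 1);	/* stand-in for fls64() */
	unsigned bytes = (bits + 6) / 7;		/* DIV_ROUND_UP(bits, 7) */
	uint64_t le;

	v = (v << bytes) | ((1ULL << (bytes - 1)) - 1);	/* bytes - 1 trailing ones */
	le = v;						/* assumes little-endian host */
	memcpy(out, &le, bytes);
	return bytes;
}

int main(void)
{
	uint8_t buf[8];
	unsigned n = varint_encode_sketch(buf, 300);

	printf("%u bytes: %02x %02x\n", n, buf[0], buf[1]);	/* 2 bytes: b1 04 */
	return 0;
}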
+
+/**
+ * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
+ *
+ * This version assumes it's always safe to write 8 bytes to @out, even if the
+ * encoded integer would be smaller.
+ */
+int bch2_varint_encode_fast(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0 << (bytes - 1));
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ }
+
+ put_unaligned_le64(v, out);
+ return bytes;
+}
+
+/**
+ * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ *
+ * This version assumes that it is safe to read at most 8 bytes past the end of
+ * @end (we still return an error if the varint extends past @end).
+ */
+int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
+{
+#ifdef CONFIG_VALGRIND
+ VALGRIND_MAKE_MEM_DEFINED(in, 8);
+#endif
+ u64 v = get_unaligned_le64(in);
+ unsigned bytes = ffz(*in) + 1;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ v >>= bytes;
+ v &= ~(~0ULL << (7 * bytes));
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
new file mode 100644
index 000000000000..92a182fb3d7a
--- /dev/null
+++ b/fs/bcachefs/varint.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+int bch2_varint_encode_fast(u8 *, u64);
+int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
new file mode 100644
index 000000000000..a6561b4b36a6
--- /dev/null
+++ b/fs/bcachefs/vstructs.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VSTRUCTS_H
+#define _VSTRUCTS_H
+
+#include "util.h"
+
+/*
+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this
+ * assumes u64 is little endian:
+ */
+#define __vstruct_u64s(_s) \
+({ \
+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
+ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
+ : ((__force u8) ((_s)->u64s))); \
+})
+
+#define __vstruct_bytes(_type, _u64s) \
+({ \
+ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
+ \
+ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
+})
+
+#define vstruct_bytes(_s) \
+ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
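+
The vstruct helpers treat an on-disk structure as a fixed header whose u64s field gives the length of a trailing _data[] payload in 64-bit words, so vstruct_bytes() is just offsetof(_data) plus u64s * 8. A hedged user-space sketch with an illustrative header layout (not a real bcachefs structure):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* illustrative on-disk header: a u64s count followed by variable-length data */
struct vstruct_example {
	uint16_t	u64s;		/* length of _data, in u64s */
	uint16_t	flags;
	uint32_t	pad;
	uint64_t	_data[];
};

int main(void)
{
	struct vstruct_example hdr = { .u64s = 3 };

	/* __vstruct_bytes(): header size plus u64s * 8 bytes of payload */
	size_t bytes = offsetof(struct vstruct_example, _data) +
		       hdr.u64s * sizeof(uint64_t);

	printf("%zu bytes\n", bytes);	/* 8 + 24 = 32 */
	return 0;
}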
+
+#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \
+ (round_up(__vstruct_bytes(_type, _u64s), \
+ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
+
+#define vstruct_blocks(_s, _sector_block_bits) \
+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
+
+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \
+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \
+ __vstruct_u64s(_s) + (_u64s))
+
+#define vstruct_sectors(_s, _sector_block_bits) \
+ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
+
+#define vstruct_next(_s) \
+ ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_last(_s) \
+ ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_end(_s) \
+ ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+
+#define vstruct_for_each(_s, _i) \
+ for (_i = (_s)->start; \
+ _i < vstruct_last(_s); \
+ _i = vstruct_next(_i))
+
+#define vstruct_for_each_safe(_s, _i, _t) \
+ for (_i = (_s)->start; \
+ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
+ _i = _t)
+
+#define vstruct_idx(_s, _idx) \
+ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
+
+#endif /* _VSTRUCTS_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
new file mode 100644
index 000000000000..5a1858fb9879
--- /dev/null
+++ b/fs/bcachefs/xattr.c
@@ -0,0 +1,653 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "rebalance.h"
+#include "str_hash.h"
+#include "xattr.h"
+
+#include <linux/dcache.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
+
+static u64 bch2_xattr_hash(const struct bch_hash_info *info,
+ const struct xattr_search_key *key)
+{
+ struct bch_str_hash_ctx ctx;
+
+ bch2_str_hash_init(&ctx, info);
+ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
+ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
+
+ return bch2_str_hash_end(&ctx, info);
+}
+
+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
+{
+ return bch2_xattr_hash(info, key);
+}
+
+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
+
+ return bch2_xattr_hash(info,
+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
+}
+
+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+ const struct xattr_search_key *r = _r;
+
+ return l.v->x_type != r->type ||
+ l.v->x_name_len != r->name.len ||
+ memcmp(l.v->x_name, r->name.name, r->name.len);
+}
+
+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
+
+ return l.v->x_type != r.v->x_type ||
+ l.v->x_name_len != r.v->x_name_len ||
+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
+}
+
+const struct bch_hash_desc bch2_xattr_hash_desc = {
+ .btree_id = BTREE_ID_xattrs,
+ .key_type = KEY_TYPE_xattr,
+ .hash_key = xattr_hash_key,
+ .hash_bkey = xattr_hash_bkey,
+ .cmp_key = xattr_cmp_key,
+ .cmp_bkey = xattr_cmp_bkey,
+};
+
+int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+ unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len));
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err,
+ xattr_val_size_too_small,
+ "value too small (%zu < %u)",
+ bkey_val_u64s(k.k), val_u64s);
+
+ /* XXX why +4 ? */
+ val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len) + 4);
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err,
+ xattr_val_size_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), val_u64s);
+
+ bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err,
+ xattr_invalid_type,
+ "invalid type (%u)", xattr.v->x_type);
+
+ bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err,
+ xattr_name_invalid_chars,
+ "xattr name has invalid characters");
+fsck_err:
+ return ret;
+}
+
+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct xattr_handler *handler;
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+
+ handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+ if (handler && handler->prefix)
+ prt_printf(out, "%s", handler->prefix);
+ else if (handler)
+ prt_printf(out, "(type %u)", xattr.v->x_type);
+ else
+ prt_printf(out, "(unknown type %u)", xattr.v->x_type);
+
+ prt_printf(out, "%.*s:%.*s",
+ xattr.v->x_name_len,
+ xattr.v->x_name,
+ le16_to_cpu(xattr.v->x_val_len),
+ (char *) xattr_val(xattr.v));
+
+ if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+ xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
+ prt_char(out, ' ');
+ bch2_acl_to_text(out, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+ }
+}
+
+static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
+ const char *name, void *buffer, size_t size, int type)
+{
+ struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
+ struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
+ struct btree_iter iter;
+ struct bkey_s_c_xattr xattr;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+ inode_inum(inode), &search, 0);
+ if (ret)
+ goto err1;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err2;
+
+ xattr = bkey_s_c_to_xattr(k);
+ ret = le16_to_cpu(xattr.v->x_val_len);
+ if (buffer) {
+ if (ret > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, xattr_val(xattr.v), ret);
+ }
+err2:
+ bch2_trans_iter_exit(trans, &iter);
+err1:
+ return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret;
+}
+
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode_u,
+ const struct bch_hash_info *hash_info,
+ const char *name, const void *value, size_t size,
+ int type, int flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter = { NULL };
+ int ret;
+
+ ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
+ bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ if (ret)
+ return ret;
+
+ inode_u->bi_ctime = bch2_current_time(c);
+
+ ret = bch2_inode_write(trans, &inode_iter, inode_u);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (ret)
+ return ret;
+
+ if (value) {
+ struct bkey_i_xattr *xattr;
+ unsigned namelen = strlen(name);
+ unsigned u64s = BKEY_U64s +
+ xattr_val_u64s(namelen, size);
+
+ if (u64s > U8_MAX)
+ return -ERANGE;
+
+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+ if (IS_ERR(xattr))
+ return PTR_ERR(xattr);
+
+ bkey_xattr_init(&xattr->k_i);
+ xattr->k.u64s = u64s;
+ xattr->v.x_type = type;
+ xattr->v.x_name_len = namelen;
+ xattr->v.x_val_len = cpu_to_le16(size);
+ memcpy(xattr->v.x_name, name, namelen);
+ memcpy(xattr_val(&xattr->v), value, size);
+
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+ inum, &xattr->k_i,
+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+ } else {
+ struct xattr_search_key search =
+ X_SEARCH(type, name, strlen(name));
+
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
+ hash_info, inum, &search);
+ }
+
+ if (bch2_err_matches(ret, ENOENT))
+ ret = flags & XATTR_REPLACE ? -ENODATA : 0;
+
+ return ret;
+}
+
+struct xattr_buf {
+ char *buf;
+ size_t len;
+ size_t used;
+};
+
+static int __bch2_xattr_emit(const char *prefix,
+ const char *name, size_t name_len,
+ struct xattr_buf *buf)
+{
+ const size_t prefix_len = strlen(prefix);
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (buf->buf) {
+ if (buf->used + total_len > buf->len)
+ return -ERANGE;
+
+ memcpy(buf->buf + buf->used, prefix, prefix_len);
+ memcpy(buf->buf + buf->used + prefix_len,
+ name, name_len);
+ buf->buf[buf->used + prefix_len + name_len] = '\0';
+ }
+
+ buf->used += total_len;
+ return 0;
+}
+
+static int bch2_xattr_emit(struct dentry *dentry,
+ const struct bch_xattr *xattr,
+ struct xattr_buf *buf)
+{
+ const struct xattr_handler *handler =
+ bch2_xattr_type_to_handler(xattr->x_type);
+
+ return handler && (!handler->list || handler->list(dentry))
+ ? __bch2_xattr_emit(handler->prefix ?: handler->name,
+ xattr->x_name, xattr->x_name_len, buf)
+ : 0;
+}
+
+static int bch2_xattr_list_bcachefs(struct bch_fs *c,
+ struct bch_inode_unpacked *inode,
+ struct xattr_buf *buf,
+ bool all)
+{
+ const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
+ unsigned id;
+ int ret = 0;
+ u64 v;
+
+ for (id = 0; id < Inode_opt_nr; id++) {
+ v = bch2_inode_opt_get(inode, id);
+ if (!v)
+ continue;
+
+ if (!all &&
+ !(inode->bi_fields_set & (1 << id)))
+ continue;
+
+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
+ strlen(bch2_inode_opts[id]), buf);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ struct bch_fs *c = dentry->d_sb->s_fs_info;
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
+ u64 offset = 0, inum = inode->ei_inode.bi_inum;
+ u32 snapshot;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
+ SPOS(inum, offset, snapshot),
+ POS(inum, U64_MAX), 0, k, ret) {
+ if (k.k->type != KEY_TYPE_xattr)
+ continue;
+
+ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+ if (ret)
+ break;
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ goto out;
+
+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
+ if (ret)
+ goto out;
+
+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
+ if (ret)
+ goto out;
+
+ return buf.used;
+out:
+ return bch2_err_class(ret);
+}
+
+static int bch2_xattr_get_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ commit_do(trans, NULL, NULL, 0,
+ bch2_xattr_set(trans, inode_inum(inode), &inode_u,
+ &hash, name, value, size,
+ handler->flags, flags)) ?:
+ (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
+
+ return bch2_err_class(ret);
+}
+
+static const struct xattr_handler bch_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = bch2_xattr_get_handler,
+ .set = bch2_xattr_set_handler,
+ .flags = KEY_TYPE_XATTR_INDEX_USER,
+};
+
+static bool bch2_xattr_trusted_list(struct dentry *dentry)
+{
+ return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler bch_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = bch2_xattr_trusted_list,
+ .get = bch2_xattr_get_handler,
+ .set = bch2_xattr_set_handler,
+ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED,
+};
+
+static const struct xattr_handler bch_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = bch2_xattr_get_handler,
+ .set = bch2_xattr_set_handler,
+ .flags = KEY_TYPE_XATTR_INDEX_SECURITY,
+};
+
+#ifndef NO_BCACHEFS_FS
+
+static int opt_to_inode_opt(int id)
+{
+ switch (id) {
+#define x(name, ...) \
+ case Opt_##name: return Inode_opt_##name;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ return -1;
+ }
+}
+
+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size,
+ bool all)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_opts opts =
+ bch2_inode_opts_to_opts(&inode->ei_inode);
+ const struct bch_option *opt;
+ int id, inode_opt_id;
+ struct printbuf out = PRINTBUF;
+ int ret;
+ u64 v;
+
+ id = bch2_opt_lookup(name);
+ if (id < 0 || !bch2_opt_is_inode_opt(id))
+ return -EINVAL;
+
+ inode_opt_id = opt_to_inode_opt(id);
+ if (inode_opt_id < 0)
+ return -EINVAL;
+
+ opt = bch2_opt_table + id;
+
+ if (!bch2_opt_defined_by_id(&opts, id))
+ return -ENODATA;
+
+ if (!all &&
+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
+ return -ENODATA;
+
+ v = bch2_opt_get_by_id(&opts, id);
+ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
+
+ ret = out.pos;
+
+ if (out.allocation_failure) {
+ ret = -ENOMEM;
+ } else if (buffer) {
+ if (out.pos > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, out.buf, out.pos);
+ }
+
+ printbuf_exit(&out);
+ return ret;
+}
+
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+ name, buffer, size, false);
+}
+
+struct inode_opt_set {
+ int id;
+ u64 v;
+ bool defined;
+};
+
+static int inode_opt_set_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct inode_opt_set *s = p;
+
+ if (s->defined)
+ bi->bi_fields_set |= 1U << s->id;
+ else
+ bi->bi_fields_set &= ~(1U << s->id);
+
+ bch2_inode_opt_set(bi, s->id, s->v);
+
+ return 0;
+}
+
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ const struct bch_option *opt;
+ char *buf;
+ struct inode_opt_set s;
+ int opt_id, inode_opt_id, ret;
+
+ opt_id = bch2_opt_lookup(name);
+ if (opt_id < 0)
+ return -EINVAL;
+
+ opt = bch2_opt_table + opt_id;
+
+ inode_opt_id = opt_to_inode_opt(opt_id);
+ if (inode_opt_id < 0)
+ return -EINVAL;
+
+ s.id = inode_opt_id;
+
+ if (value) {
+ u64 v = 0;
+
+ buf = kmalloc(size + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memcpy(buf, value, size);
+ buf[size] = '\0';
+
+ ret = bch2_opt_parse(c, opt, buf, &v, NULL);
+ kfree(buf);
+
+ if (ret < 0)
+ return ret;
+
+ ret = bch2_opt_check_may_set(c, opt_id, v);
+ if (ret < 0)
+ return ret;
+
+ s.v = v + 1;
+ s.defined = true;
+ } else {
+ /*
+ * Check if this option was set on the parent - if so, switch
+ * back to inheriting from the parent:
+ *
+ * rename() also has to deal with keeping inherited options up
+ * to date - see bch2_reinherit_attrs()
+ */
+ spin_lock(&dentry->d_lock);
+ if (!IS_ROOT(dentry)) {
+ struct bch_inode_info *dir =
+ to_bch_ei(d_inode(dentry->d_parent));
+
+ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
+ } else {
+ s.v = 0;
+ }
+ spin_unlock(&dentry->d_lock);
+
+ s.defined = false;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ if (inode_opt_id == Inode_opt_project) {
+ /*
+ * inode fields accessible via the xattr interface are stored
+ * with a +1 bias, so that 0 means unset:
+ */
+ ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
+err:
+ mutex_unlock(&inode->ei_update_lock);
+
+ if (value &&
+ (opt_id == Opt_background_compression ||
+ opt_id == Opt_background_target))
+ bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
+
+ return bch2_err_class(ret);
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+ .prefix = "bcachefs.",
+ .get = bch2_xattr_bcachefs_get,
+ .set = bch2_xattr_bcachefs_set,
+};
+
+static int bch2_xattr_bcachefs_get_effective(
+ const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+ name, buffer, size, true);
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
+ .prefix = "bcachefs_effective.",
+ .get = bch2_xattr_bcachefs_get_effective,
+ .set = bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
+const struct xattr_handler *bch2_xattr_handlers[] = {
+ &bch_xattr_user_handler,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ &nop_posix_acl_access,
+ &nop_posix_acl_default,
+#endif
+ &bch_xattr_trusted_handler,
+ &bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+ &bch_xattr_bcachefs_handler,
+ &bch_xattr_bcachefs_effective_handler,
+#endif
+ NULL
+};
+
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &nop_posix_acl_access,
+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &nop_posix_acl_default,
+ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+};
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
+{
+ return type < ARRAY_SIZE(bch_xattr_handler_map)
+ ? bch_xattr_handler_map[type]
+ : NULL;
+}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
new file mode 100644
index 000000000000..1337f31a5c49
--- /dev/null
+++ b/fs/bcachefs/xattr.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_H
+#define _BCACHEFS_XATTR_H
+
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
+
+int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
+ .key_invalid = bch2_xattr_invalid, \
+ .val_to_text = bch2_xattr_to_text, \
+ .min_val_size = 8, \
+})
+
+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
+ name_len + val_len, sizeof(u64));
+}
+
+#define xattr_val(_xattr) \
+ ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
+
+struct xattr_search_key {
+ u8 type;
+ struct qstr name;
+};
+
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
+ { .type = _type, .name = QSTR_INIT(_name, _len) })
+
+struct dentry;
+struct xattr_handler;
+struct bch_hash_info;
+struct bch_inode_info;
+
+/* Exported for cmd_migrate.c in tools: */
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *, const struct bch_hash_info *,
+ const char *, const void *, size_t, int, int);
+
+ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch2_xattr_handlers[];
+
+#endif /* _BCACHEFS_XATTR_H */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9a16a51fbb88..a93d76df8ed8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -96,6 +96,7 @@ static const struct address_space_operations befs_symlink_aops = {
};
static const struct export_operations befs_export_operations = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = befs_fh_to_dentry,
.fh_to_parent = befs_fh_to_parent,
.get_parent = befs_get_parent,
@@ -360,11 +361,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
* for indexing purposes. (PFD, page 54)
*/
- inode->i_mtime.tv_sec =
- fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
- inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */
- inode_set_ctime_to_ts(inode, inode->i_mtime);
- inode->i_atime = inode->i_mtime;
+ inode_set_mtime(inode,
+ fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16,
+ 0);/* lower 16 bits are not a time */
+ inode_set_ctime_to_ts(inode, inode_get_mtime(inode));
+ inode_set_atime_to_ts(inode, inode_get_mtime(inode));
befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent);
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 12b8af04dcb3..fbc4ae80a4b2 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir,
set_bit(ino, info->si_imap);
info->si_freei--;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
@@ -187,7 +187,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
}
de->ino = 0;
mark_buffer_dirty_inode(bh, dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
@@ -240,7 +240,7 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto end_rename;
}
old_de->ino = 0;
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
mark_inode_dirty(old_dir);
if (new_inode) {
inode_set_ctime_current(new_inode);
@@ -294,7 +294,8 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
dir->i_size += BFS_DIRENT_SIZE;
inode_set_ctime_current(dir);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
mark_inode_dirty(dir);
de->ino = cpu_to_le16((u16)ino);
for (i = 0; i < BFS_NAMELEN; i++)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index e6a76ae9eb44..355957dbce39 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -80,11 +80,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
set_nlink(inode, le32_to_cpu(di->i_nlink));
inode->i_size = BFS_FILESIZE(di);
inode->i_blocks = BFS_FILEBLOCKS(di);
- inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
- inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime);
+ inode_set_atime(inode, le32_to_cpu(di->i_atime), 0);
+ inode_set_mtime(inode, le32_to_cpu(di->i_mtime), 0);
inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
brelse(bh);
unlock_new_inode(inode);
@@ -140,9 +138,9 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_uid = cpu_to_le32(i_uid_read(inode));
di->i_gid = cpu_to_le32(i_gid_read(inode));
di->i_nlink = cpu_to_le32(inode->i_nlink);
- di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
+ di->i_atime = cpu_to_le32(inode_get_atime_sec(inode));
+ di->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode));
+ di->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode));
i_sblock = BFS_I(inode)->i_sblock;
di->i_sblock = cpu_to_le32(i_sblock);
di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7b3d2d491407..5397b552fbeb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -110,38 +110,19 @@ static struct linux_binfmt elf_format = {
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
-static int set_brk(unsigned long start, unsigned long end, int prot)
-{
- start = ELF_PAGEALIGN(start);
- end = ELF_PAGEALIGN(end);
- if (end > start) {
- /*
- * Map the last of the bss segment.
- * If the header is requesting these pages to be
- * executable, honour that (ppc32 needs this).
- */
- int error = vm_brk_flags(start, end - start,
- prot & PROT_EXEC ? VM_EXEC : 0);
- if (error)
- return error;
- }
- current->mm->start_brk = current->mm->brk = end;
- return 0;
-}
-
-/* We need to explicitly zero any fractional pages
- after the data section (i.e. bss). This would
- contain the junk from the file that should not
- be in memory
+/*
+ * We need to explicitly zero any trailing portion of the page that follows
+ * p_filesz when it ends before the page ends (e.g. bss), otherwise this
+ * memory will contain the junk from the file that should not be present.
*/
-static int padzero(unsigned long elf_bss)
+static int padzero(unsigned long address)
{
unsigned long nbyte;
- nbyte = ELF_PAGEOFFSET(elf_bss);
+ nbyte = ELF_PAGEOFFSET(address);
if (nbyte) {
nbyte = ELF_MIN_ALIGN - nbyte;
- if (clear_user((void __user *) elf_bss, nbyte))
+ if (clear_user((void __user *)address, nbyte))
return -EFAULT;
}
return 0;
@@ -367,6 +348,11 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
return 0;
}
+/*
+ * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset"
+ * into memory at "addr". (Note that p_filesz is rounded up to the
+ * next page, so any extra bytes from the file must be wiped.)
+ */
static unsigned long elf_map(struct file *filep, unsigned long addr,
const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -406,6 +392,60 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
+/*
+ * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset"
+ * into memory at "addr". Memory from "p_filesz" through "p_memsz"
+ * rounded up to the next page is zeroed.
+ */
+static unsigned long elf_load(struct file *filep, unsigned long addr,
+ const struct elf_phdr *eppnt, int prot, int type,
+ unsigned long total_size)
+{
+ unsigned long zero_start, zero_end;
+ unsigned long map_addr;
+
+ if (eppnt->p_filesz) {
+ map_addr = elf_map(filep, addr, eppnt, prot, type, total_size);
+ if (BAD_ADDR(map_addr))
+ return map_addr;
+ if (eppnt->p_memsz > eppnt->p_filesz) {
+ zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_filesz;
+ zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_memsz;
+
+ /*
+ * Zero the end of the last mapped page but ignore
+ * any errors if the segment isn't writable.
+ */
+ if (padzero(zero_start) && (prot & PROT_WRITE))
+ return -EFAULT;
+ }
+ } else {
+ map_addr = zero_start = ELF_PAGESTART(addr);
+ zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_memsz;
+ }
+ if (eppnt->p_memsz > eppnt->p_filesz) {
+ /*
+ * Map the last of the segment.
+ * If the header is requesting these pages to be
+ * executable, honour that (ppc32 needs this).
+ */
+ int error;
+
+ zero_start = ELF_PAGEALIGN(zero_start);
+ zero_end = ELF_PAGEALIGN(zero_end);
+
+ error = vm_brk_flags(zero_start, zero_end - zero_start,
+ prot & PROT_EXEC ? VM_EXEC : 0);
+ if (error)
+ map_addr = error;
+ }
+ return map_addr;
+}
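+
elf_load() maps the first p_filesz bytes of the segment from the file, clears the tail of the last file-backed page with padzero(), and maps the remaining pages up to p_memsz anonymously via vm_brk_flags(). A user-space sketch of just the page arithmetic, assuming an illustrative segment with a page-aligned p_vaddr (so ELF_PAGEOFFSET(p_vaddr) is 0):

#include <stdio.h>

#define PAGE_SIZE	0x1000UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* illustrative segment: 0x2345 bytes of file data, 0x6000 bytes in memory */
	unsigned long map_addr = 0x400000, p_filesz = 0x2345, p_memsz = 0x6000;
	unsigned long zero_start = map_addr + p_filesz;	/* 0x402345 */
	unsigned long zero_end   = map_addr + p_memsz;	/* 0x406000 */

	/* padzero() clears the rest of the last file-backed page */
	printf("padzero clears %lu bytes\n", PAGE_ALIGN(zero_start) - zero_start);

	/* vm_brk_flags() then maps the remaining anonymous, zeroed pages */
	printf("anon bss: %#lx-%#lx\n", PAGE_ALIGN(zero_start), PAGE_ALIGN(zero_end));
	return 0;
}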
+
+
static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
elf_addr_t min_addr = -1;
@@ -596,8 +636,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
struct elf_phdr *eppnt;
unsigned long load_addr = 0;
int load_addr_set = 0;
- unsigned long last_bss = 0, elf_bss = 0;
- int bss_prot = 0;
unsigned long error = ~0UL;
unsigned long total_size;
int i;
@@ -634,7 +672,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
else if (no_base && interp_elf_ex->e_type == ET_DYN)
load_addr = -vaddr;
- map_addr = elf_map(interpreter, load_addr + vaddr,
+ map_addr = elf_load(interpreter, load_addr + vaddr,
eppnt, elf_prot, elf_type, total_size);
total_size = 0;
error = map_addr;
@@ -660,51 +698,9 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
error = -ENOMEM;
goto out;
}
-
- /*
- * Find the end of the file mapping for this phdr, and
- * keep track of the largest address we see for this.
- */
- k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
- if (k > elf_bss)
- elf_bss = k;
-
- /*
- * Do the same thing for the memory mapping - between
- * elf_bss and last_bss is the bss section.
- */
- k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
- if (k > last_bss) {
- last_bss = k;
- bss_prot = elf_prot;
- }
}
}
- /*
- * Now fill out the bss section: first pad the last page from
- * the file up to the page boundary, and zero it from elf_bss
- * up to the end of the page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out;
- }
- /*
- * Next, align both the file and mem bss up to the page size,
- * since this is where elf_bss was just zeroed up to, and where
- * last_bss will end after the vm_brk_flags() below.
- */
- elf_bss = ELF_PAGEALIGN(elf_bss);
- last_bss = ELF_PAGEALIGN(last_bss);
- /* Finally, if there is still more bss to allocate, do it. */
- if (last_bss > elf_bss) {
- error = vm_brk_flags(elf_bss, last_bss - elf_bss,
- bss_prot & PROT_EXEC ? VM_EXEC : 0);
- if (error)
- goto out;
- }
-
error = load_addr;
out:
return error;
@@ -828,8 +824,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
- unsigned long elf_bss, elf_brk;
- int bss_prot = 0;
+ unsigned long elf_brk;
int retval, i;
unsigned long elf_entry;
unsigned long e_entry;
@@ -1020,7 +1015,6 @@ out_free_interp:
if (retval < 0)
goto out_free_dentry;
- elf_bss = 0;
elf_brk = 0;
start_code = ~0UL;
@@ -1040,33 +1034,6 @@ out_free_interp:
if (elf_ppnt->p_type != PT_LOAD)
continue;
- if (unlikely (elf_brk > elf_bss)) {
- unsigned long nbyte;
-
- /* There was a PT_LOAD segment with p_memsz > p_filesz
- before this one. Map anonymous pages, if needed,
- and clear the area. */
- retval = set_brk(elf_bss + load_bias,
- elf_brk + load_bias,
- bss_prot);
- if (retval)
- goto out_free_dentry;
- nbyte = ELF_PAGEOFFSET(elf_bss);
- if (nbyte) {
- nbyte = ELF_MIN_ALIGN - nbyte;
- if (nbyte > elf_brk - elf_bss)
- nbyte = elf_brk - elf_bss;
- if (clear_user((void __user *)elf_bss +
- load_bias, nbyte)) {
- /*
- * This bss-zeroing can fail if the ELF
- * file specifies odd protections. So
- * we don't check the return value
- */
- }
- }
- }
-
elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
!!interpreter, false);
@@ -1162,7 +1129,7 @@ out_free_interp:
}
}
- error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+ error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
retval = IS_ERR_VALUE(error) ?
@@ -1210,40 +1177,24 @@ out_free_interp:
k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
- if (k > elf_bss)
- elf_bss = k;
if ((elf_ppnt->p_flags & PF_X) && end_code < k)
end_code = k;
if (end_data < k)
end_data = k;
k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
- if (k > elf_brk) {
- bss_prot = elf_prot;
+ if (k > elf_brk)
elf_brk = k;
- }
}
e_entry = elf_ex->e_entry + load_bias;
phdr_addr += load_bias;
- elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
end_code += load_bias;
start_data += load_bias;
end_data += load_bias;
- /* Calling set_brk effectively mmaps the pages that we need
- * for the bss and break sections. We must do this before
- * mapping in the interpreter, to make sure it doesn't wind
- * up getting placed where the bss needs to go.
- */
- retval = set_brk(elf_bss, elf_brk, bss_prot);
- if (retval)
- goto out_free_dentry;
- if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
- retval = -EFAULT; /* Nobody gets to see this, but.. */
- goto out_free_dentry;
- }
+ current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk);
if (interpreter) {
elf_entry = load_elf_interp(interp_elf_ex,
@@ -1369,7 +1320,6 @@ static int load_elf_library(struct file *file)
{
struct elf_phdr *elf_phdata;
struct elf_phdr *eppnt;
- unsigned long elf_bss, bss, len;
int retval, error, i, j;
struct elfhdr elf_ex;
@@ -1414,30 +1364,15 @@ static int load_elf_library(struct file *file)
eppnt++;
/* Now use mmap to map the library into memory. */
- error = vm_mmap(file,
- ELF_PAGESTART(eppnt->p_vaddr),
- (eppnt->p_filesz +
- ELF_PAGEOFFSET(eppnt->p_vaddr)),
+ error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr),
+ eppnt,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED_NOREPLACE | MAP_PRIVATE,
- (eppnt->p_offset -
- ELF_PAGEOFFSET(eppnt->p_vaddr)));
+ 0);
+
if (error != ELF_PAGESTART(eppnt->p_vaddr))
goto out_free_ph;
- elf_bss = eppnt->p_vaddr + eppnt->p_filesz;
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out_free_ph;
- }
-
- len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr);
- bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr);
- if (bss > len) {
- error = vm_brk(len, bss - len);
- if (error)
- goto out_free_ph;
- }
error = 0;
out_free_ph:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 206812ce544a..fefc642541cb 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -899,10 +899,12 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
kdebug("- DYNAMIC[]: %lx", params->dynamic_addr);
seg = loadmap->segs;
for (loop = 0; loop < loadmap->nsegs; loop++, seg++)
- kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]",
+ kdebug("- LOAD[%d] : %08llx-%08llx [va=%llx ms=%llx]",
loop,
- seg->addr, seg->addr + seg->p_memsz - 1,
- seg->p_vaddr, seg->p_memsz);
+ (unsigned long long) seg->addr,
+ (unsigned long long) seg->addr + seg->p_memsz - 1,
+ (unsigned long long) seg->p_vaddr,
+ (unsigned long long) seg->p_memsz);
return 0;
@@ -1081,9 +1083,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
phdr->p_offset - disp);
- kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
- loop, phdr->p_memsz + disp, prot, flags,
- phdr->p_offset - disp, maddr);
+ kdebug("mmap[%d] <file> sz=%llx pr=%x fl=%x of=%llx --> %08lx",
+ loop, (unsigned long long) phdr->p_memsz + disp,
+ prot, flags, (unsigned long long) phdr->p_offset - disp,
+ maddr);
if (IS_ERR_VALUE(maddr))
return (int) maddr;
@@ -1145,8 +1148,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
#else
if (excess > 0) {
- kdebug("clear[%d] ad=%lx sz=%lx",
- loop, maddr + phdr->p_filesz, excess);
+ kdebug("clear[%d] ad=%llx sz=%lx", loop,
+ (unsigned long long) maddr + phdr->p_filesz,
+ excess);
if (clear_user((void *) maddr + phdr->p_filesz, excess))
return -EFAULT;
}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e0108d17b085..68fa225f89e5 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -40,9 +40,6 @@ enum {
VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
};
-static LIST_HEAD(entries);
-static int enabled = 1;
-
enum {Enabled, Magic};
#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
#define MISC_FMT_OPEN_BINARY (1UL << 30)
@@ -60,12 +57,10 @@ typedef struct {
char *name;
struct dentry *dentry;
struct file *interp_file;
+ refcount_t users; /* sync removal with load_misc_binary() */
} Node;
-static DEFINE_RWLOCK(entries_lock);
static struct file_system_type bm_fs_type;
-static struct vfsmount *bm_mnt;
-static int entry_count;
/*
* Max length of the register string. Determined by:
@@ -82,19 +77,24 @@ static int entry_count;
*/
#define MAX_REGISTER_LENGTH 1920
-/*
- * Check if we support the binfmt
- * if we do, return the node, else NULL
- * locking is done in load_misc_binary
+/**
+ * search_binfmt_handler - search for a binary handler for @bprm
+ * @misc: handle to binfmt_misc instance
+ * @bprm: binary for which we are looking for a handler
+ *
+ * Search for a binary type handler for @bprm in the list of registered binary
+ * type handlers.
+ *
+ * Return: binary type list entry on success, NULL on failure
*/
-static Node *check_file(struct linux_binprm *bprm)
+static Node *search_binfmt_handler(struct binfmt_misc *misc,
+ struct linux_binprm *bprm)
{
char *p = strrchr(bprm->interp, '.');
- struct list_head *l;
+ Node *e;
/* Walk all the registered handlers. */
- list_for_each(l, &entries) {
- Node *e = list_entry(l, Node, list);
+ list_for_each_entry(e, &misc->entries, list) {
char *s;
int j;
@@ -123,9 +123,79 @@ static Node *check_file(struct linux_binprm *bprm)
if (j == e->size)
return e;
}
+
return NULL;
}
+/**
+ * get_binfmt_handler - try to find a binary type handler
+ * @misc: handle to binfmt_misc instance
+ * @bprm: binary for which we are looking for a handler
+ *
+ * Try to find a binfmt handler for the binary type. If one is found take a
+ * reference to protect against removal via bm_{entry,status}_write().
+ *
+ * Return: binary type list entry on success, NULL on failure
+ */
+static Node *get_binfmt_handler(struct binfmt_misc *misc,
+ struct linux_binprm *bprm)
+{
+ Node *e;
+
+ read_lock(&misc->entries_lock);
+ e = search_binfmt_handler(misc, bprm);
+ if (e)
+ refcount_inc(&e->users);
+ read_unlock(&misc->entries_lock);
+ return e;
+}
+
+/**
+ * put_binfmt_handler - put binary handler node
+ * @e: node to put
+ *
+ * Free the node, syncing with load_misc_binary(), and defer the final free
+ * to load_misc_binary() in case it is still using the binary type handler
+ * we were requested to remove.
+ */
+static void put_binfmt_handler(Node *e)
+{
+ if (refcount_dec_and_test(&e->users)) {
+ if (e->flags & MISC_FMT_OPEN_FILE)
+ filp_close(e->interp_file, NULL);
+ kfree(e);
+ }
+}
+
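
The two helpers above implement a classic "pin under a read lock, free on the last put" lifetime scheme. The following is a minimal user-space sketch of that pattern, not the kernel code itself; all names (handler, handlers, handlers_lock, get_handler, put_handler, remove_handler) are illustrative, with pthread rwlocks and C11 atomics standing in for the kernel's rwlock_t and refcount_t.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct handler {
	struct handler *next;
	atomic_int users;		/* analogous to Node::users */
	char name[32];
};

static struct handler *handlers;	/* singly linked handler list */
static pthread_rwlock_t handlers_lock = PTHREAD_RWLOCK_INITIALIZER;

static void put_handler(struct handler *h)
{
	/* Whoever drops the last reference frees the object. */
	if (atomic_fetch_sub(&h->users, 1) == 1)
		free(h);
}

static struct handler *get_handler(const char *name)
{
	struct handler *h;

	pthread_rwlock_rdlock(&handlers_lock);
	for (h = handlers; h; h = h->next) {
		if (strcmp(h->name, name) == 0) {
			atomic_fetch_add(&h->users, 1);	/* pin against removal */
			break;
		}
	}
	pthread_rwlock_unlock(&handlers_lock);
	return h;
}

static void remove_handler(struct handler *h)
{
	struct handler **p;

	pthread_rwlock_wrlock(&handlers_lock);
	for (p = &handlers; *p; p = &(*p)->next) {
		if (*p == h) {
			*p = h->next;
			break;
		}
	}
	pthread_rwlock_unlock(&handlers_lock);
	put_handler(h);	/* drop the list's reference; a racing lookup frees last */
}

As in the patch, a lookup that raced with removal keeps the object alive until its own put, which is what lets load_misc_binary() keep using a handler that was concurrently deleted.
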
+/**
+ * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
+ *
+ * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
+ * If a user namespace doesn't have its own binfmt_misc mount it can make use
+ * of its ancestor's binfmt_misc handlers. This mimics the behavior of
+ * pre-namespaced binfmt_misc where all registered binfmt_misc handlers were
+ * available to all users and user namespaces on the system.
+ *
+ * Return: the binfmt_misc instance of the caller's user namespace
+ */
+static struct binfmt_misc *load_binfmt_misc(void)
+{
+ const struct user_namespace *user_ns;
+ struct binfmt_misc *misc;
+
+ user_ns = current_user_ns();
+ while (user_ns) {
+ /* Pairs with smp_store_release() in bm_fill_super(). */
+ misc = smp_load_acquire(&user_ns->binfmt_misc);
+ if (misc)
+ return misc;
+
+ user_ns = user_ns->parent;
+ }
+
+ return &init_binfmt_misc;
+}
+
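
The ancestor walk above relies on a release/acquire pairing: the publisher stores the fully initialized binfmt_misc instance with smp_store_release() (see bm_fill_super() further down), and load_binfmt_misc() reads it with smp_load_acquire(). A compact user-space analogue using C11 atomics, with a simplified namespace type and illustrative names (ns, instance, publish_instance, lookup_instance), might look like this:

#include <stdatomic.h>
#include <stddef.h>

struct ns {
	struct ns *parent;
	_Atomic(void *) instance;	/* lazily published per-namespace state */
};

/* Publisher side: initialize the object fully, then release-store it. */
static void publish_instance(struct ns *ns, void *state)
{
	atomic_store_explicit(&ns->instance, state, memory_order_release);
}

/* Reader side: acquire-load, falling back to the nearest ancestor. */
static void *lookup_instance(struct ns *ns, void *global_fallback)
{
	for (; ns; ns = ns->parent) {
		void *state = atomic_load_explicit(&ns->instance,
						   memory_order_acquire);
		if (state)
			return state;
	}
	return global_fallback;
}

The acquire load guarantees that a reader which observes the pointer also observes the initialization done before the release store, which is why no further barriers are needed on the lookup path.
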
/*
* the loader itself
*/
@@ -133,18 +203,14 @@ static int load_misc_binary(struct linux_binprm *bprm)
{
Node *fmt;
struct file *interp_file = NULL;
- int retval;
+ int retval = -ENOEXEC;
+ struct binfmt_misc *misc;
- retval = -ENOEXEC;
- if (!enabled)
+ misc = load_binfmt_misc();
+ if (!misc->enabled)
return retval;
- /* to keep locking time low, we copy the interpreter string */
- read_lock(&entries_lock);
- fmt = check_file(bprm);
- if (fmt)
- dget(fmt->dentry);
- read_unlock(&entries_lock);
+ fmt = get_binfmt_handler(misc, bprm);
if (!fmt)
return retval;
@@ -198,7 +264,16 @@ static int load_misc_binary(struct linux_binprm *bprm)
retval = 0;
ret:
- dput(fmt->dentry);
+
+ /*
+ * If we actually put the node here all concurrent calls to
+ * load_misc_binary() will have finished. We also know
+ * that for the refcount to be zero someone must have concurrently
+ * removed the binary type handler from the list and it's our job to
+ * free it.
+ */
+ put_binfmt_handler(fmt);
+
return retval;
}
@@ -287,7 +362,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
err = -ENOMEM;
memsize = sizeof(Node) + count + 8;
- e = kmalloc(memsize, GFP_KERNEL);
+ e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
if (!e)
goto out;
@@ -399,7 +474,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
if (e->mask) {
int i;
- char *masked = kmalloc(e->size, GFP_KERNEL);
+ char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);
print_hex_dump_bytes(
KBUILD_MODNAME ": register: mask[decoded]: ",
@@ -547,35 +622,114 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
+/**
+ * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
+ * @inode: inode of the relevant binfmt_misc instance
+ *
+ * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
+ * be done without any memory barriers because we are guaranteed that
+ * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
+ * binfmt_misc mount was first created.
+ *
+ * Return: struct binfmt_misc of the relevant binfmt_misc instance
+ */
+static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
+{
+ return inode->i_sb->s_user_ns->binfmt_misc;
+}
+
+/**
+ * bm_evict_inode - cleanup data associated with @inode
+ * @inode: inode to which the data is attached
+ *
+ * Clean up the binary type handler data associated with @inode if a binary type
+ * entry is removed or the filesystem is unmounted and the super block is
+ * shut down.
+ *
+ * If the ->evict call was not caused by a super block shutdown but by a write
+ * that removed the entry or all entries via bm_{entry,status}_write(), the entry
+ * will have already been removed from the list. We keep the list_empty() check
+ * to make that explicit.
+ */
static void bm_evict_inode(struct inode *inode)
{
Node *e = inode->i_private;
- if (e && e->flags & MISC_FMT_OPEN_FILE)
- filp_close(e->interp_file, NULL);
-
clear_inode(inode);
- kfree(e);
+
+ if (e) {
+ struct binfmt_misc *misc;
+
+ misc = i_binfmt_misc(inode);
+ write_lock(&misc->entries_lock);
+ if (!list_empty(&e->list))
+ list_del_init(&e->list);
+ write_unlock(&misc->entries_lock);
+ put_binfmt_handler(e);
+ }
}
-static void kill_node(Node *e)
+/**
+ * unlink_binfmt_dentry - remove the dentry for the binary type handler
+ * @dentry: dentry associated with the binary type handler
+ *
+ * Do the actual filesystem work to remove a dentry for a registered binary
+ * type handler. Since binfmt_misc only allows simple files to be created
+ * directly under the root dentry of the filesystem we ensure that we are
+ * indeed passed a dentry directly beneath the root dentry, that the inode
+ * associated with the root dentry is locked, and that it is a regular file we
+ * are asked to remove.
+ */
+static void unlink_binfmt_dentry(struct dentry *dentry)
{
- struct dentry *dentry;
+ struct dentry *parent = dentry->d_parent;
+ struct inode *inode, *parent_inode;
- write_lock(&entries_lock);
- list_del_init(&e->list);
- write_unlock(&entries_lock);
+ /* All entries are immediate descendants of the root dentry. */
+ if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
+ return;
- dentry = e->dentry;
- drop_nlink(d_inode(dentry));
- d_drop(dentry);
- dput(dentry);
- simple_release_fs(&bm_mnt, &entry_count);
+ /* We only expect to be called on regular files. */
+ inode = d_inode(dentry);
+ if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
+ return;
+
+ /* The parent inode must be locked. */
+ parent_inode = d_inode(parent);
+ if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
+ return;
+
+ if (simple_positive(dentry)) {
+ dget(dentry);
+ simple_unlink(parent_inode, dentry);
+ d_delete(dentry);
+ dput(dentry);
+ }
+}
+
+/**
+ * remove_binfmt_handler - remove a binary type handler
+ * @misc: handle to binfmt_misc instance
+ * @e: binary type handler to remove
+ *
+ * Remove a binary type handler from the list of binary type handlers and
+ * remove its associated dentry. This is called from
+ * bm_{entry,status}_write(). In the future, we might want to think about
+ * adding a proper ->unlink() method to binfmt_misc instead of forcing callers
+ * to use writes to files in order to delete binary type handlers. But it has
+ * worked for so long that it's not a pressing issue.
+ */
+static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
+{
+ write_lock(&misc->entries_lock);
+ list_del_init(&e->list);
+ write_unlock(&misc->entries_lock);
+ unlink_binfmt_dentry(e->dentry);
}
/* /<entry> */
@@ -602,8 +756,8 @@ bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- struct dentry *root;
- Node *e = file_inode(file)->i_private;
+ struct inode *inode = file_inode(file);
+ Node *e = inode->i_private;
int res = parse_command(buffer, count);
switch (res) {
@@ -617,13 +771,22 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
break;
case 3:
/* Delete this handler. */
- root = file_inode(file)->i_sb->s_root;
- inode_lock(d_inode(root));
+ inode = d_inode(inode->i_sb->s_root);
+ inode_lock(inode);
+ /*
+ * In order to add new elements to or remove elements from the list
+ * via bm_{entry,register,status}_write(), inode_lock() on the
+ * root inode must be held.
+ * The lock is exclusive, ensuring that the list can't be
+ * modified. Only load_misc_binary() can access the list
+ * concurrently, and it does so read-only. So we only need to take
+ * the write lock when we actually remove the entry from the list.
+ */
if (!list_empty(&e->list))
- kill_node(e);
+ remove_binfmt_handler(i_binfmt_misc(inode), e);
- inode_unlock(d_inode(root));
+ inode_unlock(inode);
break;
default:
return res;
@@ -647,6 +810,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
struct inode *inode;
struct super_block *sb = file_inode(file)->i_sb;
struct dentry *root = sb->s_root, *dentry;
+ struct binfmt_misc *misc;
int err = 0;
struct file *f = NULL;
@@ -656,7 +820,18 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
if (e->flags & MISC_FMT_OPEN_FILE) {
+ const struct cred *old_cred;
+
+ /*
+ * Now that we support unprivileged binfmt_misc mounts, make
+ * sure we use the credentials that the register @file was
+ * opened with to also open the interpreter. Before that this
+ * didn't matter much as only a privileged process could open
+ * the register file.
+ */
+ old_cred = override_creds(file->f_cred);
f = open_exec(e->interpreter);
+ revert_creds(old_cred);
if (IS_ERR(f)) {
pr_notice("register: failed to install interpreter file %s\n",
e->interpreter);
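
The credential handling above is a save/override/restore bracket: take the creds the register file was opened with, install them around open_exec(), then put the old ones back. A rough user-space analogue of the same discipline, using effective UIDs instead of struct cred (run_as and target_uid are illustrative names, and restoring the UID typically requires sufficient privilege):

#include <unistd.h>
#include <stdio.h>

/* Run cb() with the effective UID temporarily switched to target_uid,
 * then restore the caller's effective UID -- the same bracket shape as
 * override_creds()/revert_creds() in the hunk above. */
static int run_as(uid_t target_uid, int (*cb)(void))
{
	uid_t old = geteuid();
	int ret;

	if (seteuid(target_uid) != 0)
		return -1;
	ret = cb();
	if (seteuid(old) != 0)
		perror("seteuid restore");
	return ret;
}
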
@@ -682,21 +857,16 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
if (!inode)
goto out2;
- err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
- if (err) {
- iput(inode);
- inode = NULL;
- goto out2;
- }
-
+ refcount_set(&e->users, 1);
e->dentry = dget(dentry);
inode->i_private = e;
inode->i_fop = &bm_entry_operations;
d_instantiate(dentry, inode);
- write_lock(&entries_lock);
- list_add(&e->list, &entries);
- write_unlock(&entries_lock);
+ misc = i_binfmt_misc(inode);
+ write_lock(&misc->entries_lock);
+ list_add(&e->list, &misc->entries);
+ write_unlock(&misc->entries_lock);
err = 0;
out2:
@@ -723,35 +893,50 @@ static const struct file_operations bm_register_operations = {
static ssize_t
bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
- char *s = enabled ? "enabled\n" : "disabled\n";
+ struct binfmt_misc *misc;
+ char *s;
+ misc = i_binfmt_misc(file_inode(file));
+ s = misc->enabled ? "enabled\n" : "disabled\n";
return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
}
static ssize_t bm_status_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
+ struct binfmt_misc *misc;
int res = parse_command(buffer, count);
- struct dentry *root;
+ Node *e, *next;
+ struct inode *inode;
+ misc = i_binfmt_misc(file_inode(file));
switch (res) {
case 1:
/* Disable all handlers. */
- enabled = 0;
+ misc->enabled = false;
break;
case 2:
/* Enable all handlers. */
- enabled = 1;
+ misc->enabled = true;
break;
case 3:
/* Delete all handlers. */
- root = file_inode(file)->i_sb->s_root;
- inode_lock(d_inode(root));
+ inode = d_inode(file_inode(file)->i_sb->s_root);
+ inode_lock(inode);
- while (!list_empty(&entries))
- kill_node(list_first_entry(&entries, Node, list));
+ /*
+ * In order to add new elements to or remove elements from the list
+ * via bm_{entry,register,status}_write(), inode_lock() on the
+ * root inode must be held.
+ * The lock is exclusive, ensuring that the list can't be
+ * modified. Only load_misc_binary() can access the list
+ * concurrently, and it does so read-only. So we only need to take
+ * the write lock when we actually remove the entry from the list.
+ */
+ list_for_each_entry_safe(e, next, &misc->entries, list)
+ remove_binfmt_handler(misc, e);
- inode_unlock(d_inode(root));
+ inode_unlock(inode);
break;
default:
return res;
@@ -768,32 +953,100 @@ static const struct file_operations bm_status_operations = {
/* Superblock handling */
+static void bm_put_super(struct super_block *sb)
+{
+ struct user_namespace *user_ns = sb->s_fs_info;
+
+ sb->s_fs_info = NULL;
+ put_user_ns(user_ns);
+}
+
static const struct super_operations s_ops = {
.statfs = simple_statfs,
.evict_inode = bm_evict_inode,
+ .put_super = bm_put_super,
};
static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
{
int err;
+ struct user_namespace *user_ns = sb->s_user_ns;
+ struct binfmt_misc *misc;
static const struct tree_descr bm_files[] = {
[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
};
+ if (WARN_ON(user_ns != current_user_ns()))
+ return -EINVAL;
+
+ /*
+ * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
+ * do it here during the first mount of binfmt_misc. We don't need to
+ * waste memory for every user namespace allocation. It's likely much
+ * more common to not mount a separate binfmt_misc instance than it is
+ * to mount one.
+ *
+ * While multiple superblocks can exist, they are keyed by userns in
+ * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
+ * bm_fill_super() is called exactly once whenever a binfmt_misc
+ * superblock for a userns is created. This in turn lets us conclude
+ * that when a binfmt_misc superblock is created for the first time for
+ * a userns there's no one racing us. Therefore we don't need any
+ * barriers when we dereference binfmt_misc.
+ */
+ misc = user_ns->binfmt_misc;
+ if (!misc) {
+ /*
+ * If it turns out that most user namespaces actually want to
+ * register their own binary type handler and therefore all
+ * create their own separate binfmt_misc mounts, we should
+ * consider turning this into a kmem cache.
+ */
+ misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+ if (!misc)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&misc->entries);
+ rwlock_init(&misc->entries_lock);
+
+ /* Pairs with smp_load_acquire() in load_binfmt_misc(). */
+ smp_store_release(&user_ns->binfmt_misc, misc);
+ }
+
+ /*
+ * When the binfmt_misc superblock for this userns is shut down,
+ * ->enabled might have been set to false and we don't reinitialize
+ * ->enabled again in put_super() as someone might already be mounting
+ * binfmt_misc again. It also would be pointless since by the time
+ * ->put_super() is called we know that the binary type list for this
+ * binfmt_misc mount is empty, making load_misc_binary() return
+ * -ENOEXEC independent of whether ->enabled is true. Instead, if
+ * someone mounts binfmt_misc for the first time or again we simply
+ * reset ->enabled to true.
+ */
+ misc->enabled = true;
+
err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
if (!err)
sb->s_op = &s_ops;
return err;
}
+static void bm_free(struct fs_context *fc)
+{
+ if (fc->s_fs_info)
+ put_user_ns(fc->s_fs_info);
+}
+
static int bm_get_tree(struct fs_context *fc)
{
- return get_tree_single(fc, bm_fill_super);
+ return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
}
static const struct fs_context_operations bm_context_ops = {
+ .free = bm_free,
.get_tree = bm_get_tree,
};
@@ -812,6 +1065,7 @@ static struct file_system_type bm_fs_type = {
.owner = THIS_MODULE,
.name = "binfmt_misc",
.init_fs_context = bm_init_fs_context,
+ .fs_flags = FS_USERNS_MOUNT,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("binfmt_misc");
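
With FS_USERNS_MOUNT set and the superblock keyed by user namespace via get_tree_keyed(), an unprivileged user can mount a private binfmt_misc instance inside a new user namespace. The following is a rough user-space sketch of that flow, not a definitive recipe: it assumes a kernel carrying this series and procfs mounted at /proc, and some configurations may additionally require writing uid_map/setgroups before mounting.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (unshare(CLONE_NEWUSER | CLONE_NEWNS) != 0) {
		perror("unshare");
		return 1;
	}
	/* Keep new mounts from propagating back to the parent namespace. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) {
		perror("make / private");
		return 1;
	}
	/* The new userns owns this superblock; handlers registered here are
	 * invisible to the initial namespace. */
	if (mount("binfmt_misc", "/proc/sys/fs/binfmt_misc", "binfmt_misc",
		  0, NULL) != 0) {
		perror("mount binfmt_misc");
		return 1;
	}
	puts("per-namespace binfmt_misc mounted");
	return 0;
}
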
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a25c9910d90b..4fb925e8c981 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -48,27 +48,6 @@ config BTRFS_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
-config BTRFS_FS_CHECK_INTEGRITY
- bool "Btrfs with integrity check tool compiled in (DEPRECATED)"
- depends on BTRFS_FS
- help
- This feature has been deprecated and will be removed in 6.7.
-
- Adds code that examines all block write requests (including
- writes of the super block). The goal is to verify that the
- state of the filesystem on disk is always consistent, i.e.,
- after a power-loss or kernel panic event the filesystem is
- in a consistent state.
-
- If the integrity check tool is included and activated in
- the mount options, plenty of kernel memory is used, and
- plenty of additional CPU cycles are spent. Enabling this
- functionality is not intended for normal use.
-
- In most cases, unless you are a btrfs developer who needs
- to verify the integrity of (super)-block write requests
- during the run of a regression test, say N
-
config BTRFS_FS_RUN_SANITY_TESTS
bool "Btrfs will run sanity tests upon loading"
depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 90d53209755b..525af975f61c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
- lru_cache.o
+ lru_cache.o raid-stripe-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 8cfc8214109c..aa0844535644 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -4,6 +4,7 @@
#define BTRFS_ACCESSORS_H
#include <linux/stddef.h>
+#include <asm/unaligned.h>
struct btrfs_map_token {
struct extent_buffer *eb;
@@ -305,6 +306,14 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
+ struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+
/* struct btrfs_dev_extent */
BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
@@ -349,6 +358,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3
BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);
+BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref,
+ root_id, 64);
+
BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
type, 8);
BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
@@ -365,6 +377,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return sizeof(struct btrfs_extent_data_ref) +
offsetof(struct btrfs_extent_inline_ref, offset);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
return 0;
}
@@ -966,6 +980,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
flags, 64);
BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
rescan, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item,
+ enable_gen, 64);
/* btrfs_qgroup_info_item */
BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ce083e99ef68..9e261aac671e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <trace/events/btrfs.h>
#include "async-thread.h"
#include "ctree.h"
@@ -242,7 +243,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
break;
trace_btrfs_ordered_sched(work);
spin_unlock_irqrestore(lock, flags);
- work->ordered_func(work);
+ work->ordered_func(work, false);
/* now take the lock again and drop our item from the list */
spin_lock_irqsave(lock, flags);
@@ -277,7 +278,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
* We don't want to call the ordered free functions with
* the lock held.
*/
- work->ordered_free(work);
+ work->ordered_func(work, true);
/* NB: work must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, work);
}
@@ -285,7 +286,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
spin_unlock_irqrestore(lock, flags);
if (free_self) {
- self->ordered_free(self);
+ self->ordered_func(self, true);
/* NB: self must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, self);
}
@@ -300,7 +301,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
/*
* We should not touch things inside work in the following cases:
- * 1) after work->func() if it has no ordered_free
+ * 1) after work->func() if it has no ordered_func(..., true) to free
* Since the struct is freed in work->func().
* 2) after setting WORK_DONE_BIT
* The work may be freed in other threads almost instantly.
@@ -329,11 +330,10 @@ static void btrfs_work_helper(struct work_struct *normal_work)
}
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
- btrfs_func_t ordered_func, btrfs_func_t ordered_free)
+ btrfs_ordered_func_t ordered_func)
{
work->func = func;
work->ordered_func = ordered_func;
- work->ordered_free = ordered_free;
INIT_WORK(&work->normal_work, btrfs_work_helper);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 30f66c5e2e6e..62b8a0d57898 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -13,11 +13,11 @@ struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);
struct btrfs_work {
btrfs_func_t func;
- btrfs_func_t ordered_func;
- btrfs_func_t ordered_free;
+ btrfs_ordered_func_t ordered_func;
/* Don't touch things below */
struct work_struct normal_work;
@@ -35,7 +35,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags);
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
- btrfs_func_t ordered_func, btrfs_func_t ordered_free);
+ btrfs_ordered_func_t ordered_func);
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
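
The API change above folds the old ordered_free callback into ordered_func by adding a bool parameter: the same callback is invoked once to run the ordered portion and once more with do_free == true to release the work item. A small, self-contained user-space illustration of that convention (struct work, my_ordered and payload are illustrative names, not btrfs code):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	void (*ordered_func)(struct work *w, bool do_free);
	int payload;
};

/* One callback now serves both the "run ordered part" and the
 * "free the work item" roles, selected by do_free. */
static void my_ordered(struct work *w, bool do_free)
{
	if (do_free) {
		free(w);
		return;
	}
	printf("ordered work: payload=%d\n", w->payload);
}

int main(void)
{
	struct work *w = malloc(sizeof(*w));

	if (!w)
		return 1;
	w->ordered_func = my_ordered;
	w->payload = 42;
	w->ordered_func(w, false);	/* run the ordered portion */
	w->ordered_func(w, true);	/* final call frees the item */
	return 0;
}

This mirrors how run_one_async_done() in the bio.c hunk below frees the async_submit_bio when called with do_free set, replacing the separate run_one_async_free() callback.
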
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a4a809efc92f..beed7e459dab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1129,6 +1129,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
count, sc, GFP_NOFS);
break;
}
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA));
+ break;
default:
WARN_ON(1);
}
@@ -2998,7 +3001,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
}
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_backref_cache *cache, int is_reloc)
+ struct btrfs_backref_cache *cache, bool is_reloc)
{
int i;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 71d535e03dca..ab4ca0eda605 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -247,7 +247,7 @@ struct prelim_ref {
struct rb_node rbnode;
u64 root_id;
struct btrfs_key key_for_search;
- int level;
+ u8 level;
int count;
struct extent_inode_elem *inode_list;
u64 parent;
@@ -440,11 +440,11 @@ struct btrfs_backref_cache {
* Reloction backref cache require more info for reloc root compared
* to generic backref cache.
*/
- unsigned int is_reloc;
+ bool is_reloc;
};
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_backref_cache *cache, int is_reloc);
+ struct btrfs_backref_cache *cache, bool is_reloc);
struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_cache *cache, u64 bytenr, int level);
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
@@ -533,9 +533,9 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
- u64 bytenr, int errno)
+ u64 bytenr, int error)
{
- btrfs_panic(fs_info, errno,
+ btrfs_panic(fs_info, error,
"Inconsistency in backref cache found at offset %llu",
bytenr);
}
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 12b12443efaa..4f3b693a16b1 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -10,11 +10,11 @@
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
-#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
+#include "raid-stripe-tree.h"
static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
@@ -416,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@@ -427,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
+ } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
/* Pass on control to the original bio this one was cloned from */
@@ -463,8 +468,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
dev->devid, bio->bi_iter.bi_size);
- btrfsic_check_bio(bio);
-
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
blkcg_punt_bio_submit(bio);
else
@@ -490,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
bioc->stripes[dev_nr].bioc = bioc;
+ bioc->size = bio->bi_iter.bi_size;
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
@@ -499,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
if (!bioc) {
/* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
+ if (bio_op(bio) != REQ_OP_READ)
+ btrfs_bio(bio)->orig_physical = smap->physical;
bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
if (bio_op(bio) != REQ_OP_READ)
btrfs_bio(bio)->orig_physical = smap->physical;
@@ -568,13 +574,20 @@ static void run_one_async_start(struct btrfs_work *work)
*
* At IO completion time the csums attached on the ordered extent record are
* inserted into the tree.
+ *
+ * If called with @do_free == true, then it will free the work struct.
*/
-static void run_one_async_done(struct btrfs_work *work)
+static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
struct bio *bio = &async->bbio->bio;
+ if (do_free) {
+ kfree(container_of(work, struct async_submit_bio, work));
+ return;
+ }
+
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
btrfs_orig_bbio_end_io(async->bbio);
@@ -590,11 +603,6 @@ static void run_one_async_done(struct btrfs_work *work)
__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
-static void run_one_async_free(struct btrfs_work *work)
-{
- kfree(container_of(work, struct async_submit_bio, work));
-}
-
static bool should_async_write(struct btrfs_bio *bbio)
{
/* Submit synchronously if the checksum implementation is fast. */
@@ -636,8 +644,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
async->smap = *smap;
async->mirror_num = mirror_num;
- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
- run_one_async_free);
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
btrfs_queue_work(fs_info->workers, &async->work);
return true;
}
@@ -657,9 +664,11 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
blk_status_t ret;
int error;
+ smap.is_scrub = !bbio->inode;
+
btrfs_bio_counter_inc_blocked(fs_info);
error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num, 1);
+ &bioc, &smap, &mirror_num);
if (error) {
ret = errno_to_blk_status(error);
goto fail;
@@ -691,6 +700,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
+ if (is_data_bbio(bbio) && bioc &&
+ btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+ /*
+ * No locking for the list update, as we only add to
+ * the list in the I/O submission path, and list
+ * iteration only happens in the completion path, which
+ * can't happen until after the last submission.
+ */
+ btrfs_get_bioc(bioc);
+ list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
+ }
+
/*
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.
@@ -779,8 +800,6 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
__bio_add_page(&bio, page, length, pg_offset);
-
- btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index b2e5107b7cec..6e5dc68ff661 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -935,7 +935,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
caching_ctl->block_group = cache;
refcount_set(&caching_ctl->count, 2);
atomic_set(&caching_ctl->progress, 0);
- btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
@@ -1286,7 +1286,7 @@ out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
if (remove_rsv)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
btrfs_free_path(path);
return ret;
}
@@ -2601,7 +2601,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -2709,7 +2709,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
}
@@ -2819,8 +2819,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
- trans->delayed_ref_updates++;
- btrfs_update_delayed_refs_rsv(trans);
+ btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
return cache;
@@ -3025,7 +3024,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
fail:
btrfs_release_path(path);
/*
@@ -3051,7 +3050,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
@@ -3103,7 +3101,7 @@ again:
* time.
*/
BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
/*
* So theoretically we could recover from this, simply set the
@@ -3370,7 +3368,7 @@ again:
if (should_put)
btrfs_put_block_group(cache);
if (drop_reserve)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
@@ -3474,8 +3472,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans,
- (unsigned long) -1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
@@ -3518,7 +3515,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
/* If its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -3543,12 +3540,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_block_group *cache = NULL;
- u64 total = num_bytes;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_group *cache;
u64 old_val;
- u64 byte_in_group;
+ bool reclaim = false;
+ bool bg_already_dirty = true;
int factor;
- int ret = 0;
/* Block accounting for super block */
spin_lock(&info->delalloc_root_lock);
@@ -3560,97 +3557,86 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_super_bytes_used(info->super_copy, old_val);
spin_unlock(&info->delalloc_root_lock);
- while (total) {
- struct btrfs_space_info *space_info;
- bool reclaim = false;
-
- cache = btrfs_lookup_block_group(info, bytenr);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
- space_info = cache->space_info;
- factor = btrfs_bg_type_to_factor(cache->flags);
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache)
+ return -ENOENT;
- /*
- * If this block group has free space cache written out, we
- * need to make sure to load it if we are removing space. This
- * is because we need the unpinning stage to actually add the
- * space back to the block group, otherwise we will leak space.
- */
- if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, true);
+ /* An extent cannot span multiple block groups. */
+ ASSERT(bytenr + num_bytes <= cache->start + cache->length);
- byte_in_group = bytenr - cache->start;
- WARN_ON(byte_in_group > cache->length);
+ space_info = cache->space_info;
+ factor = btrfs_bg_type_to_factor(cache->flags);
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
+ /*
+ * If this block group has free space cache written out, we need to make
+ * sure to load it if we are removing space. This is because we need
+ * the unpinning stage to actually add the space back to the block group,
+ * otherwise we will leak space.
+ */
+ if (!alloc && !btrfs_block_group_done(cache))
+ btrfs_cache_block_group(cache, true);
- if (btrfs_test_opt(info, SPACE_CACHE) &&
- cache->disk_cache_state < BTRFS_DC_CLEAR)
- cache->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
- old_val = cache->used;
- num_bytes = min(total, cache->length - byte_in_group);
- if (alloc) {
- old_val += num_bytes;
- cache->used = old_val;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
- space_info->bytes_used += num_bytes;
- space_info->disk_used += num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
- } else {
- old_val -= num_bytes;
- cache->used = old_val;
- cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(info, space_info,
- num_bytes);
- space_info->bytes_used -= num_bytes;
- space_info->disk_used -= num_bytes * factor;
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
- reclaim = should_reclaim_block_group(cache, num_bytes);
+ old_val = cache->used;
+ if (alloc) {
+ old_val += num_bytes;
+ cache->used = old_val;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_used += num_bytes;
+ space_info->disk_used += num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->used = old_val;
+ cache->pinned += num_bytes;
+ btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
+ space_info->bytes_used -= num_bytes;
+ space_info->disk_used -= num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
+ reclaim = should_reclaim_block_group(cache, num_bytes);
- set_extent_bit(&trans->transaction->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_DIRTY, NULL);
- }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &trans->transaction->dirty_bgs);
- trans->delayed_ref_updates++;
- btrfs_get_block_group(cache);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
+ set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ }
- /*
- * No longer have used bytes in this block group, queue it for
- * deletion. We do this after adding the block group to the
- * dirty list to avoid races between cleaner kthread and space
- * cache writeout.
- */
- if (!alloc && old_val == 0) {
- if (!btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_mark_bg_unused(cache);
- } else if (!alloc && reclaim) {
- btrfs_mark_bg_to_reclaim(cache);
- }
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
- btrfs_put_block_group(cache);
- total -= num_bytes;
- bytenr += num_bytes;
+ /*
+ * No longer have used bytes in this block group, queue it for deletion.
+ * We do this after adding the block group to the dirty list to avoid
+ * races between cleaner kthread and space cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
+ btrfs_put_block_group(cache);
+
/* Modified block groups are accounted for in the delayed_refs_rsv. */
- btrfs_update_delayed_refs_rsv(trans);
- return ret;
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(info);
+
+ return 0;
}
/*
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 77684c5e0c8b..ceb5f586a2d5 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -221,7 +221,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -261,7 +262,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -279,10 +281,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *target = NULL;
/*
- * If we are the delayed_rsv then push to the global rsv, otherwise dump
- * into the delayed rsv if it is not full.
+ * If we are a delayed block reserve then push to the global rsv,
+ * otherwise dump into the global delayed reserve if it is not full.
*/
- if (block_rsv == delayed_rsv)
+ if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS)
target = global_rsv;
else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
@@ -354,6 +356,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
min_items++;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item);
+ min_items++;
+ }
+
/*
* But we also want to reserve enough space so we can do the fallback
* global reserve for an unlink, which is an additional
@@ -405,6 +412,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
@@ -517,8 +525,8 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ blocksize, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
/*
@@ -539,7 +547,7 @@ try_reserve:
* one last time to force a reservation if there's enough actual space
* on disk to make the reservation.
*/
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
BTRFS_RESERVE_FLUSH_EMERGENCY);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index bda1fdbba666..5572ae52444e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -8,6 +8,8 @@
#include <linux/hash.h>
#include <linux/refcount.h>
+#include <linux/fscrypt.h>
+#include <trace/events/btrfs.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
@@ -79,11 +81,21 @@ struct btrfs_inode {
*/
struct btrfs_key location;
+ /* Cached value of inode property 'compression'. */
+ u8 prop_compress;
+
+ /*
+ * Force compression on the file using the defrag ioctl, could be
+ * different from prop_compress and takes precedence if set.
+ */
+ u8 defrag_compress;
+
/*
* Lock for counters and all fields used to determine if the inode is in
* the log or not (last_trans, last_sub_trans, last_log_commit,
- * logged_trans), to access/update new_delalloc_bytes and to update the
- * VFS' inode number of bytes used.
+ * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes,
+ * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to
+ * update the VFS' inode number of bytes used.
*/
spinlock_t lock;
@@ -102,8 +114,18 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
+ /*
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for. Protected by 'lock'.
+ */
+ unsigned outstanding_extents;
+
/* used to order data wrt metadata */
- struct btrfs_ordered_inode_tree ordered_tree;
+ spinlock_t ordered_tree_lock;
+ struct rb_root ordered_tree;
+ struct rb_node *ordered_tree_last;
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
@@ -122,28 +144,31 @@ struct btrfs_inode {
u64 generation;
/*
- * transid of the trans_handle that last modified this inode
+ * ID of the transaction handle that last modified this inode.
+ * Protected by 'lock'.
*/
u64 last_trans;
/*
- * transid that last logged this inode
+ * ID of the transaction that last logged this inode.
+ * Protected by 'lock'.
*/
u64 logged_trans;
/*
- * log transid when this inode was last modified
+ * Log transaction ID when this inode was last modified.
+ * Protected by 'lock'.
*/
int last_sub_trans;
- /* a local copy of root's last_log_commit */
+ /* A local copy of root's last_log_commit. Protected by 'lock'. */
int last_log_commit;
union {
/*
* Total number of bytes pending delalloc, used by stat to
* calculate the real block usage of the file. This is used
- * only for files.
+ * only for files. Protected by 'lock'.
*/
u64 delalloc_bytes;
/*
@@ -161,7 +186,7 @@ struct btrfs_inode {
* Total number of bytes pending delalloc that fall within a file
* range that is either a hole or beyond EOF (and no prealloc extent
* exists in the range). This is always <= delalloc_bytes and this
- * is used only for files.
+ * is used only for files. Protected by 'lock'.
*/
u64 new_delalloc_bytes;
/*
@@ -172,15 +197,15 @@ struct btrfs_inode {
};
/*
- * total number of bytes pending defrag, used by stat to check whether
- * it needs COW.
+ * Total number of bytes pending defrag, used by stat to check whether
+ * it needs COW. Protected by 'lock'.
*/
u64 defrag_bytes;
/*
- * the size of the file stored in the metadata on disk. data=ordered
+ * The size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
- * because not all the blocks are written yet.
+ * because not all the blocks are written yet. Protected by 'lock'.
*/
u64 disk_i_size;
@@ -214,7 +239,7 @@ struct btrfs_inode {
/*
* Number of bytes outstanding that are going to need csums. This is
- * used in ENOSPC accounting.
+ * used in ENOSPC accounting. Protected by 'lock'.
*/
u64 csum_bytes;
@@ -223,30 +248,13 @@ struct btrfs_inode {
/* Read-only compatibility flags, upper half of inode_item::flags */
u32 ro_flags;
- /*
- * Counters to keep track of the number of extent item's we may use due
- * to delalloc and such. outstanding_extents is the number of extent
- * items we think we'll end up using, and reserved_extents is the number
- * of extent items we've reserved metadata for.
- */
- unsigned outstanding_extents;
-
struct btrfs_block_rsv block_rsv;
- /*
- * Cached values of inode properties
- */
- unsigned prop_compress; /* per-file compression algorithm */
- /*
- * Force compression on the file using the defrag ioctl, could be
- * different from prop_compress and takes precedence if set
- */
- unsigned defrag_compress;
-
struct btrfs_delayed_node *delayed_node;
/* File creation time. */
- struct timespec64 i_otime;
+ u64 i_otime_sec;
+ u32 i_otime_nsec;
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
@@ -387,7 +395,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
- inode->last_sub_trans <= inode->root->last_log_commit)
+ inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root))
ret = true;
spin_unlock(&inode->lock);
return ret;
@@ -481,9 +489,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
+ struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
+ struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
deleted file mode 100644
index 3caf339c4bb3..000000000000
--- a/fs/btrfs/check-integrity.c
+++ /dev/null
@@ -1,2871 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) STRATO AG 2011. All rights reserved.
- */
-
-/*
- * This module can be used to catch cases when the btrfs kernel
- * code executes write requests to the disk that bring the file
- * system in an inconsistent state. In such a state, a power-loss
- * or kernel panic event would cause that the data on disk is
- * lost or at least damaged.
- *
- * Code is added that examines all block write requests during
- * runtime (including writes of the super block). Three rules
- * are verified and an error is printed on violation of the
- * rules:
- * 1. It is not allowed to write a disk block which is
- * currently referenced by the super block (either directly
- * or indirectly).
- * 2. When a super block is written, it is verified that all
- * referenced (directly or indirectly) blocks fulfill the
- * following requirements:
- * 2a. All referenced blocks have either been present when
- * the file system was mounted, (i.e., they have been
- * referenced by the super block) or they have been
- * written since then and the write completion callback
- * was called and no write error was indicated and a
- * FLUSH request to the device where these blocks are
- * located was received and completed.
- * 2b. All referenced blocks need to have a generation
- * number which is equal to the parent's number.
- *
- * One issue that was found using this module was that the log
- * tree on disk became temporarily corrupted because disk blocks
- * that had been in use for the log tree had been freed and
- * reused too early, while being referenced by the written super
- * block.
- *
- * The search term in the kernel log that can be used to filter
- * on the existence of detected integrity issues is
- * "btrfs: attempt".
- *
- * The integrity check is enabled via mount options. These
- * mount options are only supported if the integrity check
- * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
- *
- * Example #1, apply integrity checks to all metadata:
- * mount /dev/sdb1 /mnt -o check_int
- *
- * Example #2, apply integrity checks to all metadata and
- * to data extents:
- * mount /dev/sdb1 /mnt -o check_int_data
- *
- * Example #3, apply integrity checks to all metadata and dump
- * the tree that the super block references to kernel messages
- * each time after a super block was written:
- * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
- *
- * If the integrity check tool is included and activated in
- * the mount options, plenty of kernel memory is used, and
- * plenty of additional CPU cycles are spent. Enabling this
- * functionality is not intended for normal use. In most
- * cases, unless you are a btrfs developer who needs to verify
- * the integrity of (super)-block write requests, do not
- * enable the config option BTRFS_FS_CHECK_INTEGRITY to
- * include and compile the integrity check tool.
- *
- * Expect millions of lines of information in the kernel log with an
- * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
- * kernel config to at least 26 (which is 64MB). Usually the value is
- * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
- * changed like this before LOG_BUF_SHIFT can be set to a high value:
- * config LOG_BUF_SHIFT
- * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
- * range 12 30
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/blkdev.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <crypto/hash.h>
-#include "messages.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "extent_io.h"
-#include "volumes.h"
-#include "print-tree.h"
-#include "locking.h"
-#include "check-integrity.h"
-#include "rcu-string.h"
-#include "compression.h"
-#include "accessors.h"
-
-#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
-#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
-#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
-#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
-#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
-#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
- * excluding " [...]" */
-#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
-
-/*
- * The definition of the bitmask fields for the print_mask.
- * They are specified with the mount option check_integrity_print_mask.
- */
-#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
-#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
-#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
-#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
-#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
-#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
-#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
-#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
-#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
-#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
-#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
-#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
-
-struct btrfsic_dev_state;
-struct btrfsic_state;
-
-struct btrfsic_block {
- u32 magic_num; /* only used for debug purposes */
- unsigned int is_metadata:1; /* if it is meta-data, not data-data */
- unsigned int is_superblock:1; /* if it is one of the superblocks */
- unsigned int is_iodone:1; /* if is done by lower subsystem */
- unsigned int iodone_w_error:1; /* error was indicated to endio */
- unsigned int never_written:1; /* block was added because it was
- * referenced, not because it was
- * written */
- unsigned int mirror_num; /* large enough to hold
- * BTRFS_SUPER_MIRROR_MAX */
- struct btrfsic_dev_state *dev_state;
- u64 dev_bytenr; /* key, physical byte num on disk */
- u64 logical_bytenr; /* logical byte num on disk */
- u64 generation;
- struct btrfs_disk_key disk_key; /* extra info to print in case of
- * issues, will not always be correct */
- struct list_head collision_resolving_node; /* list node */
- struct list_head all_blocks_node; /* list node */
-
- /* the following two lists contain block_link items */
- struct list_head ref_to_list; /* list */
- struct list_head ref_from_list; /* list */
- struct btrfsic_block *next_in_same_bio;
- void *orig_bio_private;
- bio_end_io_t *orig_bio_end_io;
- blk_opf_t submit_bio_bh_rw;
- u64 flush_gen; /* only valid if !never_written */
-};
-
-/*
- * Elements of this type are allocated dynamically and required because
- * each block object can refer to and can be ref from multiple blocks.
- * The key to lookup them in the hashtable is the dev_bytenr of
- * the block ref to plus the one from the block referred from.
- * The fact that they are searchable via a hashtable and that a
- * ref_cnt is maintained is not required for the btrfs integrity
- * check algorithm itself, it is only used to make the output more
- * beautiful in case that an error is detected (an error is defined
- * as a write operation to a block while that block is still referenced).
- */
-struct btrfsic_block_link {
- u32 magic_num; /* only used for debug purposes */
- u32 ref_cnt;
- struct list_head node_ref_to; /* list node */
- struct list_head node_ref_from; /* list node */
- struct list_head collision_resolving_node; /* list node */
- struct btrfsic_block *block_ref_to;
- struct btrfsic_block *block_ref_from;
- u64 parent_generation;
-};
-
-struct btrfsic_dev_state {
- u32 magic_num; /* only used for debug purposes */
- struct block_device *bdev;
- struct btrfsic_state *state;
- struct list_head collision_resolving_node; /* list node */
- struct btrfsic_block dummy_block_for_bio_bh_flush;
- u64 last_flush_gen;
-};
-
-struct btrfsic_block_hashtable {
- struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_link_hashtable {
- struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_dev_state_hashtable {
- struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_data_ctx {
- u64 start; /* virtual bytenr */
- u64 dev_bytenr; /* physical bytenr on device */
- u32 len;
- struct btrfsic_dev_state *dev;
- char **datav;
- struct page **pagev;
- void *mem_to_free;
-};
-
-/* This structure is used to implement recursion without occupying
- * any stack space, refer to btrfsic_process_metablock() */
-struct btrfsic_stack_frame {
- u32 magic;
- u32 nr;
- int error;
- int i;
- int limit_nesting;
- int num_copies;
- int mirror_num;
- struct btrfsic_block *block;
- struct btrfsic_block_data_ctx *block_ctx;
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx next_block_ctx;
- struct btrfs_header *hdr;
- struct btrfsic_stack_frame *prev;
-};
-
-/* Some state per mounted filesystem */
-struct btrfsic_state {
- u32 print_mask;
- int include_extent_data;
- struct list_head all_blocks_list;
- struct btrfsic_block_hashtable block_hashtable;
- struct btrfsic_block_link_hashtable block_link_hashtable;
- struct btrfs_fs_info *fs_info;
- u64 max_superblock_generation;
- struct btrfsic_block *latest_superblock;
- u32 metablock_size;
- u32 datablock_size;
-};
-
-static int btrfsic_process_metablock(struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- int limit_nesting, int force_iodone_flag);
-static void btrfsic_read_from_block_data(
- struct btrfsic_block_data_ctx *block_ctx,
- void *dst, u32 offset, size_t len);
-static int btrfsic_create_link_to_next_block(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx
- *block_ctx, u64 next_bytenr,
- int limit_nesting,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block **next_blockp,
- int force_iodone_flag,
- int *num_copiesp, int *mirror_nump,
- struct btrfs_disk_key *disk_key,
- u64 parent_generation);
-static int btrfsic_handle_extent_data(struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u32 item_offset, int force_iodone_flag);
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
- struct btrfsic_block_data_ctx *block_ctx_out,
- int mirror_num);
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_read_block(struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_process_written_superblock(
- struct btrfsic_state *state,
- struct btrfsic_block *const block,
- struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp);
-static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int recursion_level);
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
- struct btrfsic_block *const block,
- int recursion_level);
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l);
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l);
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
- const struct btrfsic_block *block);
-static void btrfsic_dump_tree(const struct btrfsic_state *state);
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int indent_level);
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block *next_block,
- struct btrfsic_block *from_block,
- u64 parent_generation);
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx,
- const char *additional_string,
- int is_metadata,
- int is_iodone,
- int never_written,
- int mirror_num,
- int *was_created);
-static int btrfsic_process_superblock_dev_mirror(
- struct btrfsic_state *state,
- struct btrfsic_dev_state *dev_state,
- struct btrfs_device *device,
- int superblock_mirror_num,
- struct btrfsic_dev_state **selected_dev_state,
- struct btrfs_super_block *selected_super);
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev);
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
- u64 bytenr,
- struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr);
-
-static struct mutex btrfsic_mutex;
-static int btrfsic_is_initialized;
-static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
-
-
-static void btrfsic_block_init(struct btrfsic_block *b)
-{
- b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
- b->dev_state = NULL;
- b->dev_bytenr = 0;
- b->logical_bytenr = 0;
- b->generation = BTRFSIC_GENERATION_UNKNOWN;
- b->disk_key.objectid = 0;
- b->disk_key.type = 0;
- b->disk_key.offset = 0;
- b->is_metadata = 0;
- b->is_superblock = 0;
- b->is_iodone = 0;
- b->iodone_w_error = 0;
- b->never_written = 0;
- b->mirror_num = 0;
- b->next_in_same_bio = NULL;
- b->orig_bio_private = NULL;
- b->orig_bio_end_io = NULL;
- INIT_LIST_HEAD(&b->collision_resolving_node);
- INIT_LIST_HEAD(&b->all_blocks_node);
- INIT_LIST_HEAD(&b->ref_to_list);
- INIT_LIST_HEAD(&b->ref_from_list);
- b->submit_bio_bh_rw = 0;
- b->flush_gen = 0;
-}
-
-static struct btrfsic_block *btrfsic_block_alloc(void)
-{
- struct btrfsic_block *b;
-
- b = kzalloc(sizeof(*b), GFP_NOFS);
- if (NULL != b)
- btrfsic_block_init(b);
-
- return b;
-}
-
-static void btrfsic_block_free(struct btrfsic_block *b)
-{
- BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
- kfree(b);
-}
-
-static void btrfsic_block_link_init(struct btrfsic_block_link *l)
-{
- l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
- l->ref_cnt = 1;
- INIT_LIST_HEAD(&l->node_ref_to);
- INIT_LIST_HEAD(&l->node_ref_from);
- INIT_LIST_HEAD(&l->collision_resolving_node);
- l->block_ref_to = NULL;
- l->block_ref_from = NULL;
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
-{
- struct btrfsic_block_link *l;
-
- l = kzalloc(sizeof(*l), GFP_NOFS);
- if (NULL != l)
- btrfsic_block_link_init(l);
-
- return l;
-}
-
-static void btrfsic_block_link_free(struct btrfsic_block_link *l)
-{
- BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
- kfree(l);
-}
-
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
-{
- ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
- ds->bdev = NULL;
- ds->state = NULL;
- INIT_LIST_HEAD(&ds->collision_resolving_node);
- ds->last_flush_gen = 0;
- btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
- ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
- ds->dummy_block_for_bio_bh_flush.dev_state = ds;
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
-{
- struct btrfsic_dev_state *ds;
-
- ds = kzalloc(sizeof(*ds), GFP_NOFS);
- if (NULL != ds)
- btrfsic_dev_state_init(ds);
-
- return ds;
-}
-
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
-{
- BUG_ON(!(NULL == ds ||
- BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
- kfree(ds);
-}
-
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
- struct btrfsic_block_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(b->dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
- (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
-
- list_add(&b->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
-{
- list_del(&b->collision_resolving_node);
-}
-
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
- struct block_device *bdev,
- u64 dev_bytenr,
- struct btrfsic_block_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)bdev))) &
- (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
- struct btrfsic_block *b;
-
- list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
- if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
- return b;
- }
-
- return NULL;
-}
-
-static void btrfsic_block_link_hashtable_init(
- struct btrfsic_block_link_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_link_hashtable_add(
- struct btrfsic_block_link *l,
- struct btrfsic_block_link_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
- ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
- ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
- & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
-
- BUG_ON(NULL == l->block_ref_to);
- BUG_ON(NULL == l->block_ref_from);
- list_add(&l->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
-{
- list_del(&l->collision_resolving_node);
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
- struct block_device *bdev_ref_to,
- u64 dev_bytenr_ref_to,
- struct block_device *bdev_ref_from,
- u64 dev_bytenr_ref_from,
- struct btrfsic_block_link_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
- ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
- ((unsigned int)((uintptr_t)bdev_ref_to)) ^
- ((unsigned int)((uintptr_t)bdev_ref_from))) &
- (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
- struct btrfsic_block_link *l;
-
- list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
- BUG_ON(NULL == l->block_ref_to);
- BUG_ON(NULL == l->block_ref_from);
- if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
- l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
- l->block_ref_from->dev_state->bdev == bdev_ref_from &&
- l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
- return l;
- }
-
- return NULL;
-}
-
-static void btrfsic_dev_state_hashtable_init(
- struct btrfsic_dev_state_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_dev_state_hashtable_add(
- struct btrfsic_dev_state *ds,
- struct btrfsic_dev_state_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
- (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
-
- list_add(&ds->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
-{
- list_del(&ds->collision_resolving_node);
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
- struct btrfsic_dev_state_hashtable *h)
-{
- const unsigned int hashval =
- dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1);
- struct btrfsic_dev_state *ds;
-
- list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
- if (ds->bdev->bd_dev == dev)
- return ds;
- }
-
- return NULL;
-}
-
-static int btrfsic_process_superblock(struct btrfsic_state *state,
- struct btrfs_fs_devices *fs_devices)
-{
- struct btrfs_super_block *selected_super;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
- struct btrfsic_dev_state *selected_dev_state = NULL;
- int ret = 0;
- int pass;
-
- selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
- if (!selected_super)
- return -ENOMEM;
-
- list_for_each_entry(device, dev_head, dev_list) {
- int i;
- struct btrfsic_dev_state *dev_state;
-
- if (!device->bdev || !device->name)
- continue;
-
- dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev);
- BUG_ON(NULL == dev_state);
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- ret = btrfsic_process_superblock_dev_mirror(
- state, dev_state, device, i,
- &selected_dev_state, selected_super);
- if (0 != ret && 0 == i) {
- kfree(selected_super);
- return ret;
- }
- }
- }
-
- if (NULL == state->latest_superblock) {
- pr_info("btrfsic: no superblock found!\n");
- kfree(selected_super);
- return -1;
- }
-
- for (pass = 0; pass < 3; pass++) {
- int num_copies;
- int mirror_num;
- u64 next_bytenr;
-
- switch (pass) {
- case 0:
- next_bytenr = btrfs_super_root(selected_super);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("root@%llu\n", next_bytenr);
- break;
- case 1:
- next_bytenr = btrfs_super_chunk_root(selected_super);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("chunk@%llu\n", next_bytenr);
- break;
- case 2:
- next_bytenr = btrfs_super_log_root(selected_super);
- if (0 == next_bytenr)
- continue;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("log@%llu\n", next_bytenr);
- break;
- }
-
- num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
-
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
-
- ret = btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- &tmp_next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n",
- next_bytenr, mirror_num);
- kfree(selected_super);
- return -1;
- }
-
- next_block = btrfsic_block_hashtable_lookup(
- tmp_next_block_ctx.dev->bdev,
- tmp_next_block_ctx.dev_bytenr,
- &state->block_hashtable);
- BUG_ON(NULL == next_block);
-
- l = btrfsic_block_link_hashtable_lookup(
- tmp_next_block_ctx.dev->bdev,
- tmp_next_block_ctx.dev_bytenr,
- state->latest_superblock->dev_state->
- bdev,
- state->latest_superblock->dev_bytenr,
- &state->block_link_hashtable);
- BUG_ON(NULL == l);
-
- ret = btrfsic_read_block(state, &tmp_next_block_ctx);
- if (ret < (int)PAGE_SIZE) {
- pr_info("btrfsic: read @logical %llu failed!\n",
- tmp_next_block_ctx.start);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- kfree(selected_super);
- return -1;
- }
-
- ret = btrfsic_process_metablock(state,
- next_block,
- &tmp_next_block_ctx,
- BTRFS_MAX_LEVEL + 3, 1);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- }
- }
-
- kfree(selected_super);
- return ret;
-}
-
-static int btrfsic_process_superblock_dev_mirror(
- struct btrfsic_state *state,
- struct btrfsic_dev_state *dev_state,
- struct btrfs_device *device,
- int superblock_mirror_num,
- struct btrfsic_dev_state **selected_dev_state,
- struct btrfs_super_block *selected_super)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfs_super_block *super_tmp;
- u64 dev_bytenr;
- struct btrfsic_block *superblock_tmp;
- int pass;
- struct block_device *const superblock_bdev = device->bdev;
- struct page *page;
- struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
- int ret = 0;
-
- /* super block bytenr is always the unmapped device bytenr */
- dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
- return -1;
-
- page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
- if (IS_ERR(page))
- return -1;
-
- super_tmp = page_address(page);
-
- if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
- btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
- memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
- btrfs_super_nodesize(super_tmp) != state->metablock_size ||
- btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
- ret = 0;
- goto out;
- }
-
- superblock_tmp =
- btrfsic_block_hashtable_lookup(superblock_bdev,
- dev_bytenr,
- &state->block_hashtable);
- if (NULL == superblock_tmp) {
- superblock_tmp = btrfsic_block_alloc();
- if (NULL == superblock_tmp) {
- ret = -1;
- goto out;
- }
- /* for superblock, only the dev_bytenr makes sense */
- superblock_tmp->dev_bytenr = dev_bytenr;
- superblock_tmp->dev_state = dev_state;
- superblock_tmp->logical_bytenr = dev_bytenr;
- superblock_tmp->generation = btrfs_super_generation(super_tmp);
- superblock_tmp->is_metadata = 1;
- superblock_tmp->is_superblock = 1;
- superblock_tmp->is_iodone = 1;
- superblock_tmp->never_written = 0;
- superblock_tmp->mirror_num = 1 + superblock_mirror_num;
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
- superblock_bdev,
- btrfs_dev_name(device), dev_bytenr,
- dev_state->bdev, dev_bytenr,
- superblock_mirror_num);
- list_add(&superblock_tmp->all_blocks_node,
- &state->all_blocks_list);
- btrfsic_block_hashtable_add(superblock_tmp,
- &state->block_hashtable);
- }
-
- /* select the one with the highest generation field */
- if (btrfs_super_generation(super_tmp) >
- state->max_superblock_generation ||
- 0 == state->max_superblock_generation) {
- memcpy(selected_super, super_tmp, sizeof(*selected_super));
- *selected_dev_state = dev_state;
- state->max_superblock_generation =
- btrfs_super_generation(super_tmp);
- state->latest_superblock = superblock_tmp;
- }
-
- for (pass = 0; pass < 3; pass++) {
- u64 next_bytenr;
- int num_copies;
- int mirror_num;
- const char *additional_string = NULL;
- struct btrfs_disk_key tmp_disk_key;
-
- tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
- tmp_disk_key.offset = 0;
- switch (pass) {
- case 0:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_TREE_OBJECTID);
- additional_string = "initial root ";
- next_bytenr = btrfs_super_root(super_tmp);
- break;
- case 1:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_CHUNK_TREE_OBJECTID);
- additional_string = "initial chunk ";
- next_bytenr = btrfs_super_chunk_root(super_tmp);
- break;
- case 2:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_TREE_LOG_OBJECTID);
- additional_string = "initial log ";
- next_bytenr = btrfs_super_log_root(super_tmp);
- if (0 == next_bytenr)
- continue;
- break;
- }
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
-
- if (btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- &tmp_next_block_ctx,
- mirror_num)) {
- pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
- next_bytenr, mirror_num);
- ret = -1;
- goto out;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state, &tmp_next_block_ctx,
- additional_string, 1, 1, 0,
- mirror_num, NULL);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- ret = -1;
- goto out;
- }
-
- next_block->disk_key = tmp_disk_key;
- next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
- l = btrfsic_block_link_lookup_or_add(
- state, &tmp_next_block_ctx,
- next_block, superblock_tmp,
- BTRFSIC_GENERATION_UNKNOWN);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- if (NULL == l) {
- ret = -1;
- goto out;
- }
- }
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
- btrfsic_dump_tree_sub(state, superblock_tmp, 0);
-
-out:
- put_page(page);
- return ret;
-}
-
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
-{
- struct btrfsic_stack_frame *sf;
-
- sf = kzalloc(sizeof(*sf), GFP_NOFS);
- if (sf)
- sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
- return sf;
-}
-
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
-{
- BUG_ON(!(NULL == sf ||
- BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
- kfree(sf);
-}
-
-static noinline_for_stack int btrfsic_process_metablock(
- struct btrfsic_state *state,
- struct btrfsic_block *const first_block,
- struct btrfsic_block_data_ctx *const first_block_ctx,
- int first_limit_nesting, int force_iodone_flag)
-{
- struct btrfsic_stack_frame initial_stack_frame = { 0 };
- struct btrfsic_stack_frame *sf;
- struct btrfsic_stack_frame *next_stack;
- struct btrfs_header *const first_hdr =
- (struct btrfs_header *)first_block_ctx->datav[0];
-
- BUG_ON(!first_hdr);
- sf = &initial_stack_frame;
- sf->error = 0;
- sf->i = -1;
- sf->limit_nesting = first_limit_nesting;
- sf->block = first_block;
- sf->block_ctx = first_block_ctx;
- sf->next_block = NULL;
- sf->hdr = first_hdr;
- sf->prev = NULL;
-
-continue_with_new_stack_frame:
- sf->block->generation = btrfs_stack_header_generation(sf->hdr);
- if (0 == sf->hdr->level) {
- struct btrfs_leaf *const leafhdr =
- (struct btrfs_leaf *)sf->hdr;
-
- if (-1 == sf->i) {
- sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("leaf %llu items %d generation %llu owner %llu\n",
- sf->block_ctx->start, sf->nr,
- btrfs_stack_header_generation(
- &leafhdr->header),
- btrfs_stack_header_owner(
- &leafhdr->header));
- }
-
-continue_with_current_leaf_stack_frame:
- if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
- sf->i++;
- sf->num_copies = 0;
- }
-
- if (sf->i < sf->nr) {
- struct btrfs_item disk_item;
- u32 disk_item_offset =
- (uintptr_t)(leafhdr->items + sf->i) -
- (uintptr_t)leafhdr;
- struct btrfs_disk_key *disk_key;
- u8 type;
- u32 item_offset;
- u32 item_size;
-
- if (disk_item_offset + sizeof(struct btrfs_item) >
- sf->block_ctx->len) {
-leaf_item_out_of_bounds_error:
- pr_info(
-			"btrfsic: leaf item out of bounds at logical %llu, dev %pg\n",
- sf->block_ctx->start,
- sf->block_ctx->dev->bdev);
- goto one_stack_frame_backwards;
- }
- btrfsic_read_from_block_data(sf->block_ctx,
- &disk_item,
- disk_item_offset,
- sizeof(struct btrfs_item));
- item_offset = btrfs_stack_item_offset(&disk_item);
- item_size = btrfs_stack_item_size(&disk_item);
- disk_key = &disk_item.key;
- type = btrfs_disk_key_type(disk_key);
-
- if (BTRFS_ROOT_ITEM_KEY == type) {
- struct btrfs_root_item root_item;
- u32 root_item_offset;
- u64 next_bytenr;
-
- root_item_offset = item_offset +
- offsetof(struct btrfs_leaf, items);
- if (root_item_offset + item_size >
- sf->block_ctx->len)
-					goto leaf_item_out_of_bounds_error;
- btrfsic_read_from_block_data(
- sf->block_ctx, &root_item,
- root_item_offset,
- item_size);
- next_bytenr = btrfs_root_bytenr(&root_item);
-
- sf->error =
- btrfsic_create_link_to_next_block(
- state,
- sf->block,
- sf->block_ctx,
- next_bytenr,
- sf->limit_nesting,
- &sf->next_block_ctx,
- &sf->next_block,
- force_iodone_flag,
- &sf->num_copies,
- &sf->mirror_num,
- disk_key,
- btrfs_root_generation(
- &root_item));
- if (sf->error)
- goto one_stack_frame_backwards;
-
- if (NULL != sf->next_block) {
- struct btrfs_header *const next_hdr =
- (struct btrfs_header *)
- sf->next_block_ctx.datav[0];
-
- next_stack =
- btrfsic_stack_frame_alloc();
- if (NULL == next_stack) {
- sf->error = -1;
- btrfsic_release_block_ctx(
- &sf->
- next_block_ctx);
- goto one_stack_frame_backwards;
- }
-
- next_stack->i = -1;
- next_stack->block = sf->next_block;
- next_stack->block_ctx =
- &sf->next_block_ctx;
- next_stack->next_block = NULL;
- next_stack->hdr = next_hdr;
- next_stack->limit_nesting =
- sf->limit_nesting - 1;
- next_stack->prev = sf;
- sf = next_stack;
- goto continue_with_new_stack_frame;
- }
- } else if (BTRFS_EXTENT_DATA_KEY == type &&
- state->include_extent_data) {
- sf->error = btrfsic_handle_extent_data(
- state,
- sf->block,
- sf->block_ctx,
- item_offset,
- force_iodone_flag);
- if (sf->error)
- goto one_stack_frame_backwards;
- }
-
- goto continue_with_current_leaf_stack_frame;
- }
- } else {
- struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
-
- if (-1 == sf->i) {
- sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("node %llu level %d items %d generation %llu owner %llu\n",
- sf->block_ctx->start,
- nodehdr->header.level, sf->nr,
- btrfs_stack_header_generation(
- &nodehdr->header),
- btrfs_stack_header_owner(
- &nodehdr->header));
- }
-
-continue_with_current_node_stack_frame:
- if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
- sf->i++;
- sf->num_copies = 0;
- }
-
- if (sf->i < sf->nr) {
- struct btrfs_key_ptr key_ptr;
- u32 key_ptr_offset;
- u64 next_bytenr;
-
- key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
- (uintptr_t)nodehdr;
- if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
- sf->block_ctx->len) {
- pr_info(
-			"btrfsic: node item out of bounds at logical %llu, dev %pg\n",
- sf->block_ctx->start,
- sf->block_ctx->dev->bdev);
- goto one_stack_frame_backwards;
- }
- btrfsic_read_from_block_data(
- sf->block_ctx, &key_ptr, key_ptr_offset,
- sizeof(struct btrfs_key_ptr));
- next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
-
- sf->error = btrfsic_create_link_to_next_block(
- state,
- sf->block,
- sf->block_ctx,
- next_bytenr,
- sf->limit_nesting,
- &sf->next_block_ctx,
- &sf->next_block,
- force_iodone_flag,
- &sf->num_copies,
- &sf->mirror_num,
- &key_ptr.key,
- btrfs_stack_key_generation(&key_ptr));
- if (sf->error)
- goto one_stack_frame_backwards;
-
- if (NULL != sf->next_block) {
- struct btrfs_header *const next_hdr =
- (struct btrfs_header *)
- sf->next_block_ctx.datav[0];
-
- next_stack = btrfsic_stack_frame_alloc();
- if (NULL == next_stack) {
- sf->error = -1;
- goto one_stack_frame_backwards;
- }
-
- next_stack->i = -1;
- next_stack->block = sf->next_block;
- next_stack->block_ctx = &sf->next_block_ctx;
- next_stack->next_block = NULL;
- next_stack->hdr = next_hdr;
- next_stack->limit_nesting =
- sf->limit_nesting - 1;
- next_stack->prev = sf;
- sf = next_stack;
- goto continue_with_new_stack_frame;
- }
-
- goto continue_with_current_node_stack_frame;
- }
- }
-
-one_stack_frame_backwards:
- if (NULL != sf->prev) {
- struct btrfsic_stack_frame *const prev = sf->prev;
-
- /* the one for the initial block is freed in the caller */
- btrfsic_release_block_ctx(sf->block_ctx);
-
- if (sf->error) {
- prev->error = sf->error;
- btrfsic_stack_frame_free(sf);
- sf = prev;
- goto one_stack_frame_backwards;
- }
-
- btrfsic_stack_frame_free(sf);
- sf = prev;
- goto continue_with_new_stack_frame;
- } else {
- BUG_ON(&initial_stack_frame != sf);
- }
-
- return sf->error;
-}
-
-static void btrfsic_read_from_block_data(
- struct btrfsic_block_data_ctx *block_ctx,
- void *dstv, u32 offset, size_t len)
-{
- size_t cur;
- size_t pgoff;
- char *kaddr;
- char *dst = (char *)dstv;
- size_t start_offset = offset_in_page(block_ctx->start);
- unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
-
- WARN_ON(offset + len > block_ctx->len);
- pgoff = offset_in_page(start_offset + offset);
-
- while (len > 0) {
- cur = min(len, ((size_t)PAGE_SIZE - pgoff));
- BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
- kaddr = block_ctx->datav[i];
- memcpy(dst, kaddr + pgoff, cur);
-
- dst += cur;
- len -= cur;
- pgoff = 0;
- i++;
- }
-}
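
The helper above maps a byte offset within a block whose start need not be page aligned onto a page index and an in-page offset. A small sketch of that arithmetic, assuming a hypothetical PAGE_SZ of 4096 and an invented function name, is shown below as an editorial aid.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SZ	4096u	/* hypothetical page size for the sketch */

static void map_offset(uint64_t block_start, uint32_t offset,
		       size_t *page_index, size_t *page_offset)
{
	size_t start_in_page = (size_t)(block_start % PAGE_SZ);

	*page_index  = (start_in_page + offset) / PAGE_SZ;
	*page_offset = (start_in_page + offset) % PAGE_SZ;
}

/*
 * Example: for a block that begins 512 bytes into its first page, a read
 * at offset 4000 lands at page index (512 + 4000) / 4096 = 1, in-page
 * offset 416.
 */
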
-
-static int btrfsic_create_link_to_next_block(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u64 next_bytenr,
- int limit_nesting,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block **next_blockp,
- int force_iodone_flag,
- int *num_copiesp, int *mirror_nump,
- struct btrfs_disk_key *disk_key,
- u64 parent_generation)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfsic_block *next_block = NULL;
- int ret;
- struct btrfsic_block_link *l;
- int did_alloc_block_link;
- int block_was_created;
-
- *next_blockp = NULL;
- if (0 == *num_copiesp) {
- *num_copiesp = btrfs_num_copies(fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, *num_copiesp);
- *mirror_nump = 1;
- }
-
- if (*mirror_nump > *num_copiesp)
- return 0;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n",
- *mirror_nump);
- ret = btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- next_block_ctx, *mirror_nump);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, *mirror_nump);
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(state,
- next_block_ctx, "referenced ",
- 1, force_iodone_flag,
- !force_iodone_flag,
- *mirror_nump,
- &block_was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
- if (block_was_created) {
- l = NULL;
- next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
- } else {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
- if (next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr))
- pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
- next_bytenr, next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state,
- next_block),
- next_block->logical_bytenr);
- else
- pr_info(
- "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- next_bytenr, next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state,
- next_block));
- }
- next_block->logical_bytenr = next_bytenr;
-
- next_block->mirror_num = *mirror_nump;
- l = btrfsic_block_link_hashtable_lookup(
- next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr,
- block_ctx->dev->bdev,
- block_ctx->dev_bytenr,
- &state->block_link_hashtable);
- }
-
- next_block->disk_key = *disk_key;
- if (NULL == l) {
- l = btrfsic_block_link_alloc();
- if (NULL == l) {
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- did_alloc_block_link = 1;
- l->block_ref_to = next_block;
- l->block_ref_from = block;
- l->ref_cnt = 1;
- l->parent_generation = parent_generation;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
-
- list_add(&l->node_ref_to, &block->ref_to_list);
- list_add(&l->node_ref_from, &next_block->ref_from_list);
-
- btrfsic_block_link_hashtable_add(l,
- &state->block_link_hashtable);
- } else {
- did_alloc_block_link = 0;
- if (0 == limit_nesting) {
- l->ref_cnt++;
- l->parent_generation = parent_generation;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
- }
- }
-
- if (limit_nesting > 0 && did_alloc_block_link) {
- ret = btrfsic_read_block(state, next_block_ctx);
- if (ret < (int)next_block_ctx->len) {
- pr_info("btrfsic: read block @logical %llu failed!\n",
- next_bytenr);
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- *next_blockp = next_block;
- } else {
- *next_blockp = NULL;
- }
- (*mirror_nump)++;
-
- return 0;
-}
-
-static int btrfsic_handle_extent_data(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u32 item_offset, int force_iodone_flag)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfs_file_extent_item file_extent_item;
- u64 file_extent_item_offset;
- u64 next_bytenr;
- u64 num_bytes;
- u64 generation;
- struct btrfsic_block_link *l;
- int ret;
-
- file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
- item_offset;
- if (file_extent_item_offset +
- offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
- block_ctx->len) {
-		pr_info("btrfsic: file item out of bounds at logical %llu, dev %pg\n",
- block_ctx->start, block_ctx->dev->bdev);
- return -1;
- }
-
- btrfsic_read_from_block_data(block_ctx, &file_extent_item,
- file_extent_item_offset,
- offsetof(struct btrfs_file_extent_item, disk_num_bytes));
- if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
- btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("extent_data: type %u, disk_bytenr = %llu\n",
- file_extent_item.type,
- btrfs_stack_file_extent_disk_bytenr(
- &file_extent_item));
- return 0;
- }
-
- if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
- block_ctx->len) {
-		pr_info("btrfsic: file item out of bounds at logical %llu, dev %pg\n",
- block_ctx->start, block_ctx->dev->bdev);
- return -1;
- }
- btrfsic_read_from_block_data(block_ctx, &file_extent_item,
- file_extent_item_offset,
- sizeof(struct btrfs_file_extent_item));
- next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
- if (btrfs_stack_file_extent_compression(&file_extent_item) ==
- BTRFS_COMPRESS_NONE) {
- next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
- num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
- } else {
- num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
- }
- generation = btrfs_stack_file_extent_generation(&file_extent_item);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n",
- file_extent_item.type,
- btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
- btrfs_stack_file_extent_offset(&file_extent_item),
- num_bytes);
- while (num_bytes > 0) {
- u32 chunk_len;
- int num_copies;
- int mirror_num;
-
- if (num_bytes > state->datablock_size)
- chunk_len = state->datablock_size;
- else
- chunk_len = num_bytes;
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- state->datablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block_data_ctx next_block_ctx;
- struct btrfsic_block *next_block;
- int block_was_created;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n",
- mirror_num);
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("\tdisk_bytenr = %llu, num_bytes %u\n",
- next_bytenr, chunk_len);
- ret = btrfsic_map_block(state, next_bytenr,
- chunk_len, &next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, mirror_num);
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state,
- &next_block_ctx,
- "referenced ",
- 0,
- force_iodone_flag,
- !force_iodone_flag,
- mirror_num,
- &block_was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&next_block_ctx);
- return -1;
- }
- if (!block_was_created) {
- if ((state->print_mask &
- BTRFSIC_PRINT_MASK_VERBOSE) &&
- next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr)) {
- pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
- next_bytenr,
- next_block_ctx.dev->bdev,
- next_block_ctx.dev_bytenr,
- mirror_num,
- next_block->logical_bytenr);
- }
- next_block->logical_bytenr = next_bytenr;
- next_block->mirror_num = mirror_num;
- }
-
- l = btrfsic_block_link_lookup_or_add(state,
- &next_block_ctx,
- next_block, block,
- generation);
- btrfsic_release_block_ctx(&next_block_ctx);
- if (NULL == l)
- return -1;
- }
-
- next_bytenr += chunk_len;
- num_bytes -= chunk_len;
- }
-
- return 0;
-}
-
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
- struct btrfsic_block_data_ctx *block_ctx_out,
- int mirror_num)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- int ret;
- u64 length;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_io_stripe smap, *map;
- struct btrfs_device *device;
-
- length = len;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc,
- NULL, &mirror_num, 0);
- if (ret) {
- block_ctx_out->start = 0;
- block_ctx_out->dev_bytenr = 0;
- block_ctx_out->len = 0;
- block_ctx_out->dev = NULL;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
-
- return ret;
- }
-
- if (bioc)
- map = &bioc->stripes[0];
- else
- map = &smap;
-
- device = map->dev;
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
- !device->bdev || !device->name)
- block_ctx_out->dev = NULL;
- else
- block_ctx_out->dev = btrfsic_dev_state_lookup(
- device->bdev->bd_dev);
- block_ctx_out->dev_bytenr = map->physical;
- block_ctx_out->start = bytenr;
- block_ctx_out->len = len;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
-
- kfree(bioc);
- if (NULL == block_ctx_out->dev) {
- ret = -ENXIO;
- pr_info("btrfsic: error, cannot lookup dev (#1)!\n");
- }
-
- return ret;
-}
-
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
-{
- if (block_ctx->mem_to_free) {
- unsigned int num_pages;
-
- BUG_ON(!block_ctx->datav);
- BUG_ON(!block_ctx->pagev);
- num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- /* Pages must be unmapped in reverse order */
- while (num_pages > 0) {
- num_pages--;
- if (block_ctx->datav[num_pages])
- block_ctx->datav[num_pages] = NULL;
- if (block_ctx->pagev[num_pages]) {
- __free_page(block_ctx->pagev[num_pages]);
- block_ctx->pagev[num_pages] = NULL;
- }
- }
-
- kfree(block_ctx->mem_to_free);
- block_ctx->mem_to_free = NULL;
- block_ctx->pagev = NULL;
- block_ctx->datav = NULL;
- }
-}
-
-static int btrfsic_read_block(struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx)
-{
- unsigned int num_pages;
- unsigned int i;
- size_t size;
- u64 dev_bytenr;
- int ret;
-
- BUG_ON(block_ctx->datav);
- BUG_ON(block_ctx->pagev);
- BUG_ON(block_ctx->mem_to_free);
- if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
- pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
- block_ctx->dev_bytenr);
- return -1;
- }
-
- num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
- block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
- if (!block_ctx->mem_to_free)
- return -ENOMEM;
- block_ctx->datav = block_ctx->mem_to_free;
- block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
- if (ret)
- return ret;
-
- dev_bytenr = block_ctx->dev_bytenr;
- for (i = 0; i < num_pages;) {
- struct bio *bio;
- unsigned int j;
-
- bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
- REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT;
-
- for (j = i; j < num_pages; j++) {
- ret = bio_add_page(bio, block_ctx->pagev[j],
- PAGE_SIZE, 0);
- if (PAGE_SIZE != ret)
- break;
- }
- if (j == i) {
- pr_info("btrfsic: error, failed to add a single page!\n");
- return -1;
- }
- if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %pg!\n",
- block_ctx->start, block_ctx->dev->bdev);
- bio_put(bio);
- return -1;
- }
- bio_put(bio);
- dev_bytenr += (j - i) * PAGE_SIZE;
- i = j;
- }
- for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
-
- return block_ctx->len;
-}
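
btrfsic_read_block() above fills an array of pages by issuing reads that advance the physical byte number in page-sized steps. Below is a rough userspace analogue of that loop using pread() on an ordinary file descriptor; it is an editorial sketch with invented names and minimal error handling, not the removed kernel code.

#include <stdint.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

#define PAGE_SZ	4096u	/* hypothetical page size for the sketch */

/*
 * Read num_pages page-sized chunks starting at dev_bytenr. Returns 0 on
 * success, -1 on error; on failure the caller frees whatever buffers
 * were already allocated.
 */
static int read_pages(int fd, uint64_t dev_bytenr, char **pagev,
		      unsigned int num_pages)
{
	unsigned int i;

	for (i = 0; i < num_pages; i++) {
		pagev[i] = malloc(PAGE_SZ);
		if (!pagev[i])
			return -1;
		if (pread(fd, pagev[i], PAGE_SZ,
			  (off_t)(dev_bytenr + (uint64_t)i * PAGE_SZ)) !=
		    (ssize_t)PAGE_SZ)
			return -1;
	}

	return 0;
}
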
-
-static void btrfsic_dump_database(struct btrfsic_state *state)
-{
- const struct btrfsic_block *b_all;
-
- BUG_ON(NULL == state);
-
- pr_info("all_blocks_list:\n");
- list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
- const struct btrfsic_block_link *l;
-
- pr_info("%c-block @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num);
-
- list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(
- " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- }
-
- list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(
- " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr,
- l->block_ref_from->mirror_num);
- }
-
- pr_info("\n");
- }
-}
-
-/*
- * Test whether the disk block contains a tree block (leaf or node)
- * (note that this test fails for the super block)
- */
-static noinline_for_stack int btrfsic_test_for_metadata(
- struct btrfsic_state *state,
- char **datav, unsigned int num_pages)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- struct btrfs_header *h;
- u8 csum[BTRFS_CSUM_SIZE];
- unsigned int i;
-
- if (num_pages * PAGE_SIZE < state->metablock_size)
- return 1; /* not metadata */
- num_pages = state->metablock_size >> PAGE_SHIFT;
- h = (struct btrfs_header *)datav[0];
-
- if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
- return 1;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
-
- for (i = 0; i < num_pages; i++) {
- u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
- size_t sublen = i ? PAGE_SIZE :
- (PAGE_SIZE - BTRFS_CSUM_SIZE);
-
- crypto_shash_update(shash, data, sublen);
- }
- crypto_shash_final(shash, csum);
- if (memcmp(csum, h->csum, fs_info->csum_size))
- return 1;
-
- return 0; /* is metadata */
-}
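
The test above hashes the candidate block across its pages while skipping the checksum bytes stored at the start of the first page, then compares the result with the stored value. The editorial sketch below shows only the "skip the embedded checksum field" pattern; a trivial additive sum stands in for the real hash, and CSUM_BYTES and all names are assumptions.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SZ		4096u	/* hypothetical page size */
#define CSUM_BYTES	32u	/* hypothetical size of the stored checksum */

static uint64_t sum_block(char * const *datav, unsigned int num_pages)
{
	uint64_t sum = 0;
	unsigned int i;

	for (i = 0; i < num_pages; i++) {
		const unsigned char *data = (const unsigned char *)datav[i];
		size_t len = PAGE_SZ;
		size_t j;

		if (i == 0) {		/* skip the embedded checksum area */
			data += CSUM_BYTES;
			len -= CSUM_BYTES;
		}
		for (j = 0; j < len; j++)
			sum += data[j];
	}

	return sum;
}
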
-
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char **mapped_datav,
- unsigned int num_pages,
- struct bio *bio, int *bio_is_patched,
- blk_opf_t submit_bio_bh_rw)
-{
- int is_metadata;
- struct btrfsic_block *block;
- struct btrfsic_block_data_ctx block_ctx;
- int ret;
- struct btrfsic_state *state = dev_state->state;
- struct block_device *bdev = dev_state->bdev;
- unsigned int processed_len;
-
- if (NULL != bio_is_patched)
- *bio_is_patched = 0;
-
-again:
- if (num_pages == 0)
- return;
-
- processed_len = 0;
- is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
- num_pages));
-
- block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
- &state->block_hashtable);
- if (NULL != block) {
- u64 bytenr = 0;
- struct btrfsic_block_link *l, *tmp;
-
- if (block->is_superblock) {
- bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
- mapped_datav[0]);
- if (num_pages * PAGE_SIZE <
- BTRFS_SUPER_INFO_SIZE) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- is_metadata = 1;
- BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
- processed_len = BTRFS_SUPER_INFO_SIZE;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
- pr_info("[before new superblock is written]:\n");
- btrfsic_dump_tree_sub(state, block, 0);
- }
- }
- if (is_metadata) {
- if (!block->is_superblock) {
- if (num_pages * PAGE_SIZE <
- state->metablock_size) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- processed_len = state->metablock_size;
- bytenr = btrfs_stack_header_bytenr(
- (struct btrfs_header *)
- mapped_datav[0]);
- btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
- dev_state,
- dev_bytenr);
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
- if (block->logical_bytenr != bytenr &&
- !(!block->is_metadata &&
- block->logical_bytenr == 0))
- pr_info(
-"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
- bytenr, dev_state->bdev,
- dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state,
- block),
- block->logical_bytenr);
- else
- pr_info(
- "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- bytenr, dev_state->bdev,
- dev_bytenr, block->mirror_num,
- btrfsic_get_block_type(state,
- block));
- }
- block->logical_bytenr = bytenr;
- } else {
- if (num_pages * PAGE_SIZE <
- state->datablock_size) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- processed_len = state->datablock_size;
- bytenr = block->logical_bytenr;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- bytenr, dev_state->bdev, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block));
- }
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("ref_to_list: %cE, ref_from_list: %cE\n",
- list_empty(&block->ref_to_list) ? ' ' : '!',
- list_empty(&block->ref_from_list) ? ' ' : '!');
- if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
- btrfsic_get_block_type(state, block), bytenr,
- dev_state->bdev, dev_bytenr, block->mirror_num,
- block->generation,
- btrfs_disk_key_objectid(&block->disk_key),
- block->disk_key.type,
- btrfs_disk_key_offset(&block->disk_key),
- btrfs_stack_header_generation(
- (struct btrfs_header *) mapped_datav[0]),
- state->max_superblock_generation);
- btrfsic_dump_tree(state);
- }
-
- if (!block->is_iodone && !block->never_written) {
- pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
- btrfsic_get_block_type(state, block), bytenr,
- dev_state->bdev, dev_bytenr, block->mirror_num,
- block->generation,
- btrfs_stack_header_generation(
- (struct btrfs_header *)
- mapped_datav[0]));
- /* it would not be safe to go on */
- btrfsic_dump_tree(state);
- goto continue_loop;
- }
-
-		/*
-		 * Clear all references of this block. Do not free the block
-		 * itself even if it is not referenced anymore, because it
-		 * still carries valuable information, e.g. whether it was
-		 * ever written and whether its IO completed.
-		 */
- list_for_each_entry_safe(l, tmp, &block->ref_to_list,
- node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_rem_link(state, l);
- l->ref_cnt--;
- if (0 == l->ref_cnt) {
- list_del(&l->node_ref_to);
- list_del(&l->node_ref_from);
- btrfsic_block_link_hashtable_remove(l);
- btrfsic_block_link_free(l);
- }
- }
-
- block_ctx.dev = dev_state;
- block_ctx.dev_bytenr = dev_bytenr;
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.pagev = NULL;
- block_ctx.mem_to_free = NULL;
- block_ctx.datav = mapped_datav;
-
- if (is_metadata || state->include_extent_data) {
- block->never_written = 0;
- block->iodone_w_error = 0;
- if (NULL != bio) {
- block->is_iodone = 0;
- BUG_ON(NULL == bio_is_patched);
- if (!*bio_is_patched) {
- block->orig_bio_private =
- bio->bi_private;
- block->orig_bio_end_io =
- bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- *bio_is_patched = 1;
- } else {
- struct btrfsic_block *chained_block =
- (struct btrfsic_block *)
- bio->bi_private;
-
- BUG_ON(NULL == chained_block);
- block->orig_bio_private =
- chained_block->orig_bio_private;
- block->orig_bio_end_io =
- chained_block->orig_bio_end_io;
- block->next_in_same_bio = chained_block;
- bio->bi_private = block;
- }
- } else {
- block->is_iodone = 1;
- block->orig_bio_private = NULL;
- block->orig_bio_end_io = NULL;
- block->next_in_same_bio = NULL;
- }
- }
-
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = submit_bio_bh_rw;
- if (is_metadata) {
- block->logical_bytenr = bytenr;
- block->is_metadata = 1;
- if (block->is_superblock) {
- BUG_ON(PAGE_SIZE !=
- BTRFS_SUPER_INFO_SIZE);
- ret = btrfsic_process_written_superblock(
- state,
- block,
- (struct btrfs_super_block *)
- mapped_datav[0]);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
- pr_info("[after new superblock is written]:\n");
- btrfsic_dump_tree_sub(state, block, 0);
- }
- } else {
- block->mirror_num = 0; /* unknown */
- ret = btrfsic_process_metablock(
- state,
- block,
- &block_ctx,
- 0, 0);
- }
- if (ret)
- pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n",
- dev_bytenr);
- } else {
- block->is_metadata = 0;
- block->mirror_num = 0; /* unknown */
- block->generation = BTRFSIC_GENERATION_UNKNOWN;
- if (!state->include_extent_data
- && list_empty(&block->ref_from_list)) {
-				/*
-				 * The disk block is overwritten with extent
-				 * data (not metadata) and we are configured
-				 * not to include extent data: take the
-				 * chance and free the block's memory.
-				 */
- btrfsic_block_hashtable_remove(block);
- list_del(&block->all_blocks_node);
- btrfsic_block_free(block);
- }
- }
- btrfsic_release_block_ctx(&block_ctx);
- } else {
- /* block has not been found in hash table */
- u64 bytenr;
-
- if (!is_metadata) {
- processed_len = state->datablock_size;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block (%pg/%llu/?) !found in hash table, D\n",
- dev_state->bdev, dev_bytenr);
- if (!state->include_extent_data) {
- /* ignore that written D block */
- goto continue_loop;
- }
-
-			/*
-			 * This is getting ugly for the include_extent_data
-			 * case...
-			 */
- bytenr = 0; /* unknown */
- } else {
- processed_len = state->metablock_size;
- bytenr = btrfs_stack_header_bytenr(
- (struct btrfs_header *)
- mapped_datav[0]);
- btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
- dev_bytenr);
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
- bytenr, dev_state->bdev, dev_bytenr);
- }
-
- block_ctx.dev = dev_state;
- block_ctx.dev_bytenr = dev_bytenr;
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.pagev = NULL;
- block_ctx.mem_to_free = NULL;
- block_ctx.datav = mapped_datav;
-
- block = btrfsic_block_alloc();
- if (NULL == block) {
- btrfsic_release_block_ctx(&block_ctx);
- goto continue_loop;
- }
- block->dev_state = dev_state;
- block->dev_bytenr = dev_bytenr;
- block->logical_bytenr = bytenr;
- block->is_metadata = is_metadata;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->mirror_num = 0; /* unknown */
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = submit_bio_bh_rw;
- if (NULL != bio) {
- block->is_iodone = 0;
- BUG_ON(NULL == bio_is_patched);
- if (!*bio_is_patched) {
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- *bio_is_patched = 1;
- } else {
- struct btrfsic_block *chained_block =
- (struct btrfsic_block *)
- bio->bi_private;
-
- BUG_ON(NULL == chained_block);
- block->orig_bio_private =
- chained_block->orig_bio_private;
- block->orig_bio_end_io =
- chained_block->orig_bio_end_io;
- block->next_in_same_bio = chained_block;
- bio->bi_private = block;
- }
- } else {
- block->is_iodone = 1;
- block->orig_bio_private = NULL;
- block->orig_bio_end_io = NULL;
- block->next_in_same_bio = NULL;
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
- is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- list_add(&block->all_blocks_node, &state->all_blocks_list);
- btrfsic_block_hashtable_add(block, &state->block_hashtable);
-
- if (is_metadata) {
- ret = btrfsic_process_metablock(state, block,
- &block_ctx, 0, 0);
- if (ret)
- pr_info("btrfsic: process_metablock(root @%llu) failed!\n",
- dev_bytenr);
- }
- btrfsic_release_block_ctx(&block_ctx);
- }
-
-continue_loop:
- BUG_ON(!processed_len);
- dev_bytenr += processed_len;
- mapped_datav += processed_len >> PAGE_SHIFT;
- num_pages -= processed_len >> PAGE_SHIFT;
- goto again;
-}
-
-static void btrfsic_bio_end_io(struct bio *bp)
-{
- struct btrfsic_block *block = bp->bi_private;
- int iodone_w_error;
-
-	/*
-	 * The mutex is not held! This is not safe if the IO has not yet
-	 * completed on umount.
-	 */
- iodone_w_error = 0;
- if (bp->bi_status)
- iodone_w_error = 1;
-
- BUG_ON(NULL == block);
- bp->bi_private = block->orig_bio_private;
- bp->bi_end_io = block->orig_bio_end_io;
-
- do {
- struct btrfsic_block *next_block;
- struct btrfsic_dev_state *const dev_state = block->dev_state;
-
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
- bp->bi_status,
- btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- next_block = block->next_in_same_bio;
- block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
- dev_state->last_flush_gen++;
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %pg flush_gen=%llu\n",
- dev_state->bdev,
- dev_state->last_flush_gen);
- }
- if (block->submit_bio_bh_rw & REQ_FUA)
- block->flush_gen = 0; /* FUA completed means block is
- * on disk */
- block->is_iodone = 1; /* for FLUSH, this releases the block */
- block = next_block;
- } while (NULL != block);
-
- bp->bi_end_io(bp);
-}
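
btrfsic_bio_end_io() above is the unwinding half of a callback-interception pattern: the original bi_private and bi_end_io were saved when the bio was patched, and here they are restored before the original completion handler is chained to. A minimal, hypothetical sketch of that pattern in plain C follows; struct request, struct tracker and the function names are invented for illustration.

#include <stdio.h>

struct request {
	void (*end_io)(struct request *req);
	void *private;
};

struct tracker {
	void (*orig_end_io)(struct request *req);
	void *orig_private;
};

static void tracking_end_io(struct request *req)
{
	struct tracker *t = req->private;

	printf("request completed, doing integrity bookkeeping\n");

	/* restore the original completion state and chain to it */
	req->private = t->orig_private;
	req->end_io = t->orig_end_io;
	req->end_io(req);
}

static void patch_request(struct request *req, struct tracker *t)
{
	/* save the original handler, then install the wrapper */
	t->orig_end_io = req->end_io;
	t->orig_private = req->private;
	req->private = t;
	req->end_io = tracking_end_io;
}
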
-
-static int btrfsic_process_written_superblock(
- struct btrfsic_state *state,
- struct btrfsic_block *const superblock,
- struct btrfs_super_block *const super_hdr)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- int pass;
-
- superblock->generation = btrfs_super_generation(super_hdr);
- if (!(superblock->generation > state->max_superblock_generation ||
- 0 == state->max_superblock_generation)) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info(
- "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
- superblock->logical_bytenr,
- superblock->dev_state->bdev,
- superblock->dev_bytenr, superblock->mirror_num,
- btrfs_super_generation(super_hdr),
- state->max_superblock_generation);
- } else {
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info(
- "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
- superblock->logical_bytenr,
- superblock->dev_state->bdev,
- superblock->dev_bytenr, superblock->mirror_num,
- btrfs_super_generation(super_hdr),
- state->max_superblock_generation);
-
- state->max_superblock_generation =
- btrfs_super_generation(super_hdr);
- state->latest_superblock = superblock;
- }
-
- for (pass = 0; pass < 3; pass++) {
- int ret;
- u64 next_bytenr;
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
- int num_copies;
- int mirror_num;
- const char *additional_string = NULL;
- struct btrfs_disk_key tmp_disk_key = {0};
-
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_ITEM_KEY);
- btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
-
- switch (pass) {
- case 0:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_TREE_OBJECTID);
- additional_string = "root ";
- next_bytenr = btrfs_super_root(super_hdr);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("root@%llu\n", next_bytenr);
- break;
- case 1:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_CHUNK_TREE_OBJECTID);
- additional_string = "chunk ";
- next_bytenr = btrfs_super_chunk_root(super_hdr);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("chunk@%llu\n", next_bytenr);
- break;
- case 2:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_TREE_LOG_OBJECTID);
- additional_string = "log ";
- next_bytenr = btrfs_super_log_root(super_hdr);
- if (0 == next_bytenr)
- continue;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("log@%llu\n", next_bytenr);
- break;
- }
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- BTRFS_SUPER_INFO_SIZE);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- int was_created;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num);
- ret = btrfsic_map_block(state, next_bytenr,
- BTRFS_SUPER_INFO_SIZE,
- &tmp_next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, mirror_num);
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state,
- &tmp_next_block_ctx,
- additional_string,
- 1, 0, 1,
- mirror_num,
- &was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- return -1;
- }
-
- next_block->disk_key = tmp_disk_key;
- if (was_created)
- next_block->generation =
- BTRFSIC_GENERATION_UNKNOWN;
- l = btrfsic_block_link_lookup_or_add(
- state,
- &tmp_next_block_ctx,
- next_block,
- superblock,
- BTRFSIC_GENERATION_UNKNOWN);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- if (NULL == l)
- return -1;
- }
- }
-
- if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
- btrfsic_dump_tree(state);
-
- return 0;
-}
-
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
- struct btrfsic_block *const block,
- int recursion_level)
-{
- const struct btrfsic_block_link *l;
- int ret = 0;
-
- if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
-		/*
-		 * Note that this situation can happen and does not
-		 * indicate an error in regular cases. It happens
-		 * when disk blocks are freed and later reused.
-		 * The check-integrity module is not aware of any
-		 * block free operations; it only recognizes block
-		 * write operations. Therefore it keeps the linkage
-		 * information for a block until that block is
-		 * rewritten. This can temporarily cause incorrect
-		 * and even circular linkage information, which
-		 * causes no harm unless such blocks are referenced
-		 * by the most recent super block.
-		 */
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic: abort cyclic linkage (case 1).\n");
-
- return ret;
- }
-
- /*
- * This algorithm is recursive because the amount of used stack
- * space is very small and the max recursion depth is limited.
- */
- list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
- recursion_level,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- if (l->block_ref_to->never_written) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (!l->block_ref_to->is_iodone) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (l->block_ref_to->iodone_w_error) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (l->parent_generation !=
- l->block_ref_to->generation &&
- BTRFSIC_GENERATION_UNKNOWN !=
- l->parent_generation &&
- BTRFSIC_GENERATION_UNKNOWN !=
- l->block_ref_to->generation) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num,
- l->block_ref_to->generation,
- l->parent_generation);
- ret = -1;
- } else if (l->block_ref_to->flush_gen >
- l->block_ref_to->dev_state->last_flush_gen) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num, block->flush_gen,
- l->block_ref_to->dev_state->last_flush_gen);
- ret = -1;
- } else if (-1 == btrfsic_check_all_ref_blocks(state,
- l->block_ref_to,
- recursion_level +
- 1)) {
- ret = -1;
- }
- }
-
- return ret;
-}
-
-static int btrfsic_is_block_ref_by_superblock(
- const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int recursion_level)
-{
- const struct btrfsic_block_link *l;
-
- if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
- /* refer to comment at "abort cyclic linkage (case 1)" */
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic: abort cyclic linkage (case 2).\n");
-
- return 0;
- }
-
- /*
- * This algorithm is recursive because the amount of used stack space
- * is very small and the max recursion depth is limited.
- */
- list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
- recursion_level,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr,
- l->block_ref_from->mirror_num);
- if (l->block_ref_from->is_superblock &&
- state->latest_superblock->dev_bytenr ==
- l->block_ref_from->dev_bytenr &&
- state->latest_superblock->dev_state->bdev ==
- l->block_ref_from->dev_state->bdev)
- return 1;
- else if (btrfsic_is_block_ref_by_superblock(state,
- l->block_ref_from,
- recursion_level +
- 1))
- return 1;
- }
-
- return 0;
-}
-
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l)
-{
- pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
-}
-
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l)
-{
- pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
-}
-
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
- const struct btrfsic_block *block)
-{
- if (block->is_superblock &&
- state->latest_superblock->dev_bytenr == block->dev_bytenr &&
- state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
- return 'S';
- else if (block->is_superblock)
- return 's';
- else if (block->is_metadata)
- return 'M';
- else
- return 'D';
-}
-
-static void btrfsic_dump_tree(const struct btrfsic_state *state)
-{
- btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
-}
-
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int indent_level)
-{
- const struct btrfsic_block_link *l;
- int indent_add;
- static char buf[80];
- int cursor_position;
-
- /*
- * Should better fill an on-stack buffer with a complete line and
- * dump it at once when it is time to print a newline character.
- */
-
- /*
- * This algorithm is recursive because the amount of used stack space
- * is very small and the max recursion depth is limited.
- */
- indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
- printk("[...]\n");
- return;
- }
- printk(buf);
- indent_level += indent_add;
- if (list_empty(&block->ref_to_list)) {
- printk("\n");
- return;
- }
- if (block->mirror_num > 1 &&
- !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
- printk(" [...]\n");
- return;
- }
-
- cursor_position = indent_level;
- list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
- while (cursor_position < indent_level) {
- printk(" ");
- cursor_position++;
- }
- if (l->ref_cnt > 1)
- indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
- else
- indent_add = sprintf(buf, " --> ");
- if (indent_level + indent_add >
- BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
- printk("[...]\n");
- cursor_position = 0;
- continue;
- }
-
- printk(buf);
-
- btrfsic_dump_tree_sub(state, l->block_ref_to,
- indent_level + indent_add);
- cursor_position = 0;
- }
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block *next_block,
- struct btrfsic_block *from_block,
- u64 parent_generation)
-{
- struct btrfsic_block_link *l;
-
- l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr,
- from_block->dev_state->bdev,
- from_block->dev_bytenr,
- &state->block_link_hashtable);
- if (NULL == l) {
- l = btrfsic_block_link_alloc();
- if (!l)
- return NULL;
-
- l->block_ref_to = next_block;
- l->block_ref_from = from_block;
- l->ref_cnt = 1;
- l->parent_generation = parent_generation;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
-
- list_add(&l->node_ref_to, &from_block->ref_to_list);
- list_add(&l->node_ref_from, &next_block->ref_from_list);
-
- btrfsic_block_link_hashtable_add(l,
- &state->block_link_hashtable);
- } else {
- l->ref_cnt++;
- l->parent_generation = parent_generation;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
- }
-
- return l;
-}
-
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx,
- const char *additional_string,
- int is_metadata,
- int is_iodone,
- int never_written,
- int mirror_num,
- int *was_created)
-{
- struct btrfsic_block *block;
-
- block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
- block_ctx->dev_bytenr,
- &state->block_hashtable);
- if (NULL == block) {
- struct btrfsic_dev_state *dev_state;
-
- block = btrfsic_block_alloc();
- if (!block)
- return NULL;
-
- dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
- if (NULL == dev_state) {
- pr_info("btrfsic: error, lookup dev_state failed!\n");
- btrfsic_block_free(block);
- return NULL;
- }
- block->dev_state = dev_state;
- block->dev_bytenr = block_ctx->dev_bytenr;
- block->logical_bytenr = block_ctx->start;
- block->is_metadata = is_metadata;
- block->is_iodone = is_iodone;
- block->never_written = never_written;
- block->mirror_num = mirror_num;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
- additional_string,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->bdev,
- block->dev_bytenr, mirror_num);
- list_add(&block->all_blocks_node, &state->all_blocks_list);
- btrfsic_block_hashtable_add(block, &state->block_hashtable);
- if (NULL != was_created)
- *was_created = 1;
- } else {
- if (NULL != was_created)
- *was_created = 0;
- }
-
- return block;
-}
-
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
- u64 bytenr,
- struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfsic_block_data_ctx block_ctx;
- int num_copies;
- int mirror_num;
- int match = 0;
- int ret;
-
- num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size);
-
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr, state->metablock_size,
- &block_ctx, mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n",
- bytenr, mirror_num);
- continue;
- }
-
- if (dev_state->bdev == block_ctx.dev->bdev &&
- dev_bytenr == block_ctx.dev_bytenr) {
- match++;
- btrfsic_release_block_ctx(&block_ctx);
- break;
- }
- btrfsic_release_block_ctx(&block_ctx);
- }
-
- if (WARN_ON(!match)) {
- pr_info(
-"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
- bytenr, dev_state->bdev, dev_bytenr);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr,
- state->metablock_size,
- &block_ctx, mirror_num);
- if (ret)
- continue;
-
- pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
- bytenr, block_ctx.dev->bdev,
- block_ctx.dev_bytenr, mirror_num);
- }
- }
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
-{
- return btrfsic_dev_state_hashtable_lookup(dev,
- &btrfsic_dev_state_hashtable);
-}
-
-static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
- unsigned int segs = bio_segments(bio);
- u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
- u64 cur_bytenr = dev_bytenr;
- struct bvec_iter iter;
- struct bio_vec bvec;
- char **mapped_datav;
- int bio_is_patched = 0;
- int i = 0;
-
- if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info(
-"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- return;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
-
- btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
- bio, &bio_is_patched, bio->bi_opf);
- kfree(mapped_datav);
-}
-
-static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
- if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
-
- if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
-
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- } else if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE))) {
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- }
-}
-
-void btrfsic_check_bio(struct bio *bio)
-{
- struct btrfsic_dev_state *dev_state;
-
- if (!btrfsic_is_initialized)
- return;
-
- /*
- * We can be called before btrfsic_mount, so there might not be a
- * dev_state.
- */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- mutex_lock(&btrfsic_mutex);
- if (dev_state) {
- if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
- btrfsic_check_write_bio(bio, dev_state);
- else if (bio->bi_opf & REQ_PREFLUSH)
- btrfsic_check_flush_bio(bio, dev_state);
- }
- mutex_unlock(&btrfsic_mutex);
-}
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices,
- int including_extent_data, u32 print_mask)
-{
- int ret;
- struct btrfsic_state *state;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
-
- if (!PAGE_ALIGNED(fs_info->nodesize)) {
- pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
- fs_info->nodesize, PAGE_SIZE);
- return -1;
- }
- if (!PAGE_ALIGNED(fs_info->sectorsize)) {
- pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
- fs_info->sectorsize, PAGE_SIZE);
- return -1;
- }
- state = kvzalloc(sizeof(*state), GFP_KERNEL);
- if (!state)
- return -ENOMEM;
-
- if (!btrfsic_is_initialized) {
- mutex_init(&btrfsic_mutex);
- btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
- btrfsic_is_initialized = 1;
- }
- mutex_lock(&btrfsic_mutex);
- state->fs_info = fs_info;
- state->print_mask = print_mask;
- state->include_extent_data = including_extent_data;
- state->metablock_size = fs_info->nodesize;
- state->datablock_size = fs_info->sectorsize;
- INIT_LIST_HEAD(&state->all_blocks_list);
- btrfsic_block_hashtable_init(&state->block_hashtable);
- btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
- state->max_superblock_generation = 0;
- state->latest_superblock = NULL;
-
- list_for_each_entry(device, dev_head, dev_list) {
- struct btrfsic_dev_state *ds;
-
- if (!device->bdev || !device->name)
- continue;
-
- ds = btrfsic_dev_state_alloc();
- if (NULL == ds) {
- mutex_unlock(&btrfsic_mutex);
- return -ENOMEM;
- }
- ds->bdev = device->bdev;
- ds->state = state;
- btrfsic_dev_state_hashtable_add(ds,
- &btrfsic_dev_state_hashtable);
- }
-
- ret = btrfsic_process_superblock(state, fs_devices);
- if (0 != ret) {
- mutex_unlock(&btrfsic_mutex);
- btrfsic_unmount(fs_devices);
- return ret;
- }
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
- btrfsic_dump_database(state);
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
- btrfsic_dump_tree(state);
-
- mutex_unlock(&btrfsic_mutex);
- return 0;
-}
-
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
-{
- struct btrfsic_block *b_all, *tmp_all;
- struct btrfsic_state *state;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
-
- if (!btrfsic_is_initialized)
- return;
-
- mutex_lock(&btrfsic_mutex);
-
- state = NULL;
- list_for_each_entry(device, dev_head, dev_list) {
- struct btrfsic_dev_state *ds;
-
- if (!device->bdev || !device->name)
- continue;
-
- ds = btrfsic_dev_state_hashtable_lookup(
- device->bdev->bd_dev,
- &btrfsic_dev_state_hashtable);
- if (NULL != ds) {
- state = ds->state;
- btrfsic_dev_state_hashtable_remove(ds);
- btrfsic_dev_state_free(ds);
- }
- }
-
- if (NULL == state) {
- pr_info("btrfsic: error, cannot find state information on umount!\n");
- mutex_unlock(&btrfsic_mutex);
- return;
- }
-
- /*
- * Don't care about keeping the lists' state up to date,
- * just free all memory that was allocated dynamically.
- * Free the blocks and the block_links.
- */
- list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
- all_blocks_node) {
- struct btrfsic_block_link *l, *tmp;
-
- list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
- node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_rem_link(state, l);
-
- l->ref_cnt--;
- if (0 == l->ref_cnt)
- btrfsic_block_link_free(l);
- }
-
- if (b_all->is_iodone || b_all->never_written)
- btrfsic_block_free(b_all);
- else
- pr_info(
-"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num);
- }
-
- mutex_unlock(&btrfsic_mutex);
-
- kvfree(state);
-}
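
The two checkers deleted above, btrfsic_check_all_ref_blocks() and btrfsic_is_block_ref_by_superblock(), walk the block link graph recursively and simply give up once the depth reaches 3 + BTRFS_MAX_LEVEL, because stale links left behind by freed-and-reused blocks can form temporary cycles. A minimal userspace sketch of that depth-capped walk follows; the types and names are illustrative, not the kernel's.

/* Depth-capped recursive walk over a block reference graph (sketch only). */
#include <stdbool.h>
#include <stddef.h>

#define MAX_DEPTH 11    /* stands in for 3 + BTRFS_MAX_LEVEL (8) */

struct block {
        bool never_written;
        size_t nr_refs;
        struct block **refs;            /* blocks this block references */
};

/* Return 0 if every reachable block has been written, -1 otherwise. */
static int check_refs(const struct block *b, int depth)
{
        int ret = 0;

        /* Treat too-deep chains as stale, possibly cyclic linkage, not as an error. */
        if (depth >= MAX_DEPTH)
                return 0;

        for (size_t i = 0; i < b->nr_refs; i++) {
                const struct block *to = b->refs[i];

                if (to->never_written || check_refs(to, depth + 1) == -1)
                        ret = -1;
        }
        return ret;
}

int main(void)
{
        struct block leaf = { .never_written = false, .nr_refs = 0, .refs = NULL };
        struct block *children[] = { &leaf };
        struct block root = { .never_written = false, .nr_refs = 1, .refs = children };

        return check_refs(&root, 0) ? 1 : 0;
}
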
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
deleted file mode 100644
index e4c8aed7996f..000000000000
--- a/fs/btrfs/check-integrity.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) STRATO AG 2011. All rights reserved.
- */
-
-#ifndef BTRFS_CHECK_INTEGRITY_H
-#define BTRFS_CHECK_INTEGRITY_H
-
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_check_bio(struct bio *bio);
-#else
-static inline void btrfsic_check_bio(struct bio *bio) { }
-#endif
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices,
- int including_extent_data, u32 print_mask);
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices);
-
-#endif
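
The header removed above also shows the usual config-gated stub pattern: with CONFIG_BTRFS_FS_CHECK_INTEGRITY disabled, btrfsic_check_bio() becomes an empty static inline, so call sites stay free of #ifdefs and the compiler can drop the calls entirely. A generic sketch of the same pattern, with invented names (CONFIG_MY_FEATURE, my_feature_hook):

/* my_feature.h -- hypothetical header using the same stub pattern. */
#ifndef MY_FEATURE_H
#define MY_FEATURE_H

#ifdef CONFIG_MY_FEATURE
void my_feature_hook(void *data);               /* real implementation built elsewhere */
#else
static inline void my_feature_hook(void *data)
{
        (void)data;                             /* feature compiled out: no-op */
}
#endif

#endif /* MY_FEATURE_H */
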
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8818ed5c390f..19b22b4653c8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -193,12 +193,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
- const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
+ const int error = blk_status_to_errno(cb->bbio.bio.bi_status);
int i;
int ret;
- if (errno)
- mapping_set_error(inode->i_mapping, errno);
+ if (error)
+ mapping_set_error(inode->i_mapping, error);
folio_batch_init(&fbatch);
while (index <= end_index) {
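
The errno -> error rename in end_compressed_writeback() looks like naming hygiene: errno is a reserved identifier that <errno.h> may define as a macro, so a local variable of that name is easy to misread. A small userspace sketch of the convention the code follows (0 on success, negative errno value on failure), with a local that does not clash:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical helper in kernel style: returns 0 on success, -EIO on failure. */
static int do_write(int fail)
{
        return fail ? -EIO : 0;
}

int main(void)
{
        const int error = do_write(1);  /* named "error", not "errno" */

        if (error)
                printf("write failed: %s (%d)\n", strerror(-error), error);
        return 0;
}
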
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 617d4827eec2..35c1d24d4a78 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p)
* cause could be a bug, eg. due to ENOSPC, and not for common errors that are
* caused by external factors.
*/
-bool __cold abort_should_print_stack(int errno)
+bool __cold abort_should_print_stack(int error)
{
- switch (errno) {
+ switch (error) {
case -EIO:
case -EROFS:
case -ENOMEM:
@@ -316,6 +316,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
+ u64 reloc_src_root = 0;
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
@@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ reloc_src_root = btrfs_header_owner(buf);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
&disk_key, level, buf->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ reloc_src_root, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -359,7 +362,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return ret;
}
- btrfs_mark_buffer_dirty(cow);
+ btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
}
@@ -429,7 +432,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(trans, root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
- &refs, &flags);
+ &refs, &flags, NULL);
if (ret)
return ret;
if (unlikely(refs == 0)) {
@@ -518,13 +521,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
*/
-static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf,
- struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size,
- enum btrfs_lock_nesting nest)
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -533,6 +536,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start = 0;
+ u64 reloc_src_root = 0;
if (*cow_ret == buf)
unlock_orig = 1;
@@ -551,12 +555,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
- parent_start = parent->start;
-
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent)
+ parent_start = parent->start;
+ reloc_src_root = btrfs_header_owner(buf);
+ }
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
- search_start, empty_size, nest);
+ search_start, empty_size, reloc_src_root, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -627,7 +633,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
@@ -643,7 +649,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (unlock_orig)
btrfs_tree_unlock(buf);
free_extent_buffer_stale(buf);
- btrfs_mark_buffer_dirty(cow);
+ btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
}
@@ -679,11 +685,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
}
/*
- * cows a single block, see __btrfs_cow_block for the real work.
+ * COWs a single block, see btrfs_force_cow_block() for the real work.
* This version of it has extra checks so that a block isn't COWed more than
* once per transaction, as long as it hasn't been written yet
*/
-noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
@@ -723,7 +729,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
- search_start = buf->start & ~((u64)SZ_1G - 1);
+ search_start = round_down(buf->start, SZ_1G);
/*
* Before CoWing this block for later modification, check if it's
@@ -732,8 +738,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
* Also we don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
- ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0, nest);
+ ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
+ cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -742,49 +748,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
- * helper function for defrag to decide if two blocks pointed to by a
- * node are actually close by
- */
-static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
-{
- if (blocknr < other && other - (blocknr + blocksize) < 32768)
- return 1;
- if (blocknr > other && blocknr - (other + blocksize) < 32768)
- return 1;
- return 0;
-}
-
-#ifdef __LITTLE_ENDIAN
-
-/*
- * Compare two keys, on little-endian the disk order is same as CPU order and
- * we can avoid the conversion.
- */
-static int comp_keys(const struct btrfs_disk_key *disk_key,
- const struct btrfs_key *k2)
-{
- const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
-
- return btrfs_comp_cpu_keys(k1, k2);
-}
-
-#else
-
-/*
- * compare two keys in a memcmp fashion
- */
-static int comp_keys(const struct btrfs_disk_key *disk,
- const struct btrfs_key *k2)
-{
- struct btrfs_key k1;
-
- btrfs_disk_key_to_cpu(&k1, disk);
-
- return btrfs_comp_cpu_keys(&k1, k2);
-}
-#endif
-
-/*
* same as comp_keys only with two btrfs_key's
*/
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
@@ -805,105 +768,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
}
/*
- * this is used by the defrag code to go through all the
- * leaves pointed to by a node and reallocate them so that
- * disk order is close to key order
- */
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, u64 *last_ret,
- struct btrfs_key *progress)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *cur;
- u64 blocknr;
- u64 search_start = *last_ret;
- u64 last_block = 0;
- u64 other;
- u32 parent_nritems;
- int end_slot;
- int i;
- int err = 0;
- u32 blocksize;
- int progress_passed = 0;
- struct btrfs_disk_key disk_key;
-
- /*
- * COWing must happen through a running transaction, which always
- * matches the current fs generation (it's a transaction with a state
- * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
- * into error state to prevent the commit of any transaction.
- */
- if (unlikely(trans->transaction != fs_info->running_transaction ||
- trans->transid != fs_info->generation)) {
- btrfs_abort_transaction(trans, -EUCLEAN);
- btrfs_crit(fs_info,
-"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
- parent->start, btrfs_root_id(root), trans->transid,
- fs_info->running_transaction->transid,
- fs_info->generation);
- return -EUCLEAN;
- }
-
- parent_nritems = btrfs_header_nritems(parent);
- blocksize = fs_info->nodesize;
- end_slot = parent_nritems - 1;
-
- if (parent_nritems <= 1)
- return 0;
-
- for (i = start_slot; i <= end_slot; i++) {
- int close = 1;
-
- btrfs_node_key(parent, &disk_key, i);
- if (!progress_passed && comp_keys(&disk_key, progress) < 0)
- continue;
-
- progress_passed = 1;
- blocknr = btrfs_node_blockptr(parent, i);
- if (last_block == 0)
- last_block = blocknr;
-
- if (i > 0) {
- other = btrfs_node_blockptr(parent, i - 1);
- close = close_blocks(blocknr, other, blocksize);
- }
- if (!close && i < end_slot) {
- other = btrfs_node_blockptr(parent, i + 1);
- close = close_blocks(blocknr, other, blocksize);
- }
- if (close) {
- last_block = blocknr;
- continue;
- }
-
- cur = btrfs_read_node_slot(parent, i);
- if (IS_ERR(cur))
- return PTR_ERR(cur);
- if (search_start == 0)
- search_start = last_block;
-
- btrfs_tree_lock(cur);
- err = __btrfs_cow_block(trans, root, cur, parent, i,
- &cur, search_start,
- min(16 * blocksize,
- (end_slot - i) * blocksize),
- BTRFS_NESTING_COW);
- if (err) {
- btrfs_tree_unlock(cur);
- free_extent_buffer(cur);
- break;
- }
- search_start = cur->start;
- last_block = cur->start;
- *last_ret = search_start;
- btrfs_tree_unlock(cur);
- free_extent_buffer(cur);
- }
- return err;
-}
-
-/*
* Search for a key in the given extent_buffer.
*
* The lower boundary for the search is specified by the slot number @first_slot.
@@ -969,7 +833,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
tmp = &unaligned;
}
- ret = comp_keys(tmp, key);
+ ret = btrfs_comp_keys(tmp, key);
if (ret < 0)
low = mid + 1;
@@ -984,19 +848,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
return 1;
}
-static void root_add_used(struct btrfs_root *root, u32 size)
+static void root_add_used_bytes(struct btrfs_root *root)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
- btrfs_root_used(&root->root_item) + size);
+ btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
spin_unlock(&root->accounting_lock);
}
-static void root_sub_used(struct btrfs_root *root, u32 size)
+static void root_sub_used_bytes(struct btrfs_root *root)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
- btrfs_root_used(&root->root_item) - size);
+ btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
spin_unlock(&root->accounting_lock);
}
@@ -1112,7 +976,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* once for the path */
free_extent_buffer(mid);
- root_sub_used(root, mid->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
@@ -1182,7 +1046,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
goto out;
}
- root_sub_used(root, right->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), right,
0, 1);
free_extent_buffer_stale(right);
@@ -1197,7 +1061,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_set_node_key(parent, &right_key, pslot + 1);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
}
}
if (btrfs_header_nritems(mid) == 1) {
@@ -1240,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = NULL;
goto out;
}
- root_sub_used(root, mid->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
@@ -1255,7 +1119,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_set_node_key(parent, &mid_key, pslot);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
}
/* update the path */
@@ -1362,7 +1226,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
return ret;
}
btrfs_set_node_key(parent, &disk_key, pslot);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (btrfs_header_nritems(left) > orig_slot) {
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -1422,7 +1286,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
return ret;
}
btrfs_set_node_key(parent, &disk_key, pslot + 1);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (btrfs_header_nritems(mid) <= orig_slot) {
path->nodes[level] = right;
@@ -2006,7 +1870,7 @@ static int search_leaf(struct btrfs_trans_handle *trans,
* the extent buffer's header and we have recently accessed
* the header's level field.
*/
- ret = comp_keys(&first_key, key);
+ ret = btrfs_comp_keys(&first_key, key);
if (ret < 0) {
/*
* The first key is smaller than the key we want
@@ -2091,8 +1955,8 @@ static int search_leaf(struct btrfs_trans_handle *trans,
}
/*
- * btrfs_search_slot - look for a key in a tree and perform necessary
- * modifications to preserve tree invariants.
+ * Look for a key in a tree and perform necessary modifications to preserve
+ * tree invariants.
*
* @trans: Handle of transaction, used when modifying the tree
* @p: Holds all btree nodes along the search path
@@ -2515,7 +2379,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
*/
if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
- ret = comp_keys(&found_key, &orig_key);
+ ret = btrfs_comp_keys(&found_key, &orig_key);
if (ret == 0) {
if (path->slots[0] > 0) {
path->slots[0]--;
@@ -2530,7 +2394,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
}
btrfs_item_key(path->nodes[0], &found_key, 0);
- ret = comp_keys(&found_key, &key);
+ ret = btrfs_comp_keys(&found_key, &key);
/*
* We might have had an item with the previous key in the tree right
* before we released our path. And after we released our path, that
@@ -2678,7 +2542,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
* higher levels
*
*/
-static void fixup_low_keys(struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
struct btrfs_disk_key *key, int level)
{
int i;
@@ -2695,7 +2560,7 @@ static void fixup_low_keys(struct btrfs_path *path,
BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
- btrfs_mark_buffer_dirty(path->nodes[i]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[i]);
if (tslot != 0)
break;
}
@@ -2707,10 +2572,11 @@ static void fixup_low_keys(struct btrfs_path *path,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *new_key)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_disk_key disk_key;
struct extent_buffer *eb;
int slot;
@@ -2719,7 +2585,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+ if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2733,7 +2599,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+ if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2748,9 +2614,9 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(eb, &disk_key, slot);
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
if (slot == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
@@ -2881,8 +2747,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
}
btrfs_set_header_nritems(src, src_nritems - push_items);
btrfs_set_header_nritems(dst, dst_nritems + push_items);
- btrfs_mark_buffer_dirty(src);
- btrfs_mark_buffer_dirty(dst);
+ btrfs_mark_buffer_dirty(trans, src);
+ btrfs_mark_buffer_dirty(trans, dst);
return ret;
}
@@ -2957,8 +2823,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(src, src_nritems - push_items);
btrfs_set_header_nritems(dst, dst_nritems + push_items);
- btrfs_mark_buffer_dirty(src);
- btrfs_mark_buffer_dirty(dst);
+ btrfs_mark_buffer_dirty(trans, src);
+ btrfs_mark_buffer_dirty(trans, dst);
return ret;
}
@@ -2974,7 +2840,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 lower_gen;
struct extent_buffer *lower;
struct extent_buffer *c;
@@ -2993,11 +2858,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&lower_key, level, root->node->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ 0, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
btrfs_set_header_nritems(c, 1);
btrfs_set_node_key(c, &lower_key, 0);
@@ -3007,7 +2872,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(c, 0, lower_gen);
- btrfs_mark_buffer_dirty(c);
+ btrfs_mark_buffer_dirty(trans, c);
old = root->node;
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
@@ -3079,7 +2944,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid == 0);
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
- btrfs_mark_buffer_dirty(lower);
+ btrfs_mark_buffer_dirty(trans, lower);
return 0;
}
@@ -3137,11 +3002,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, level, c->start, 0,
- BTRFS_NESTING_SPLIT);
+ 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
@@ -3158,8 +3023,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
- btrfs_mark_buffer_dirty(c);
- btrfs_mark_buffer_dirty(split);
+ btrfs_mark_buffer_dirty(trans, c);
+ btrfs_mark_buffer_dirty(trans, split);
ret = insert_ptr(trans, path, &disk_key, split->start,
path->slots[level + 1] + 1, level + 1);
@@ -3325,15 +3190,15 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(left, left_nritems);
if (left_nritems)
- btrfs_mark_buffer_dirty(left);
+ btrfs_mark_buffer_dirty(trans, left);
else
btrfs_clear_buffer_dirty(trans, left);
- btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(trans, right);
btrfs_item_key(right, &disk_key, 0);
btrfs_set_node_key(upper, &disk_key, slot + 1);
- btrfs_mark_buffer_dirty(upper);
+ btrfs_mark_buffer_dirty(trans, upper);
/* then fixup the leaf pointer in the path */
if (path->slots[0] >= left_nritems) {
@@ -3545,14 +3410,14 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_set_token_item_offset(&token, i, push_space);
}
- btrfs_mark_buffer_dirty(left);
+ btrfs_mark_buffer_dirty(trans, left);
if (right_nritems)
- btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(trans, right);
else
btrfs_clear_buffer_dirty(trans, right);
btrfs_item_key(right, &disk_key, 0);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -3683,8 +3548,8 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- btrfs_mark_buffer_dirty(right);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, right);
+ btrfs_mark_buffer_dirty(trans, l);
BUG_ON(path->slots[0] != slot);
if (mid <= slot) {
@@ -3888,13 +3753,13 @@ again:
* use BTRFS_NESTING_NEW_ROOT.
*/
right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &disk_key, 0, l->start, 0,
+ &disk_key, 0, l->start, 0, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
if (split == 0) {
if (mid <= slot) {
@@ -3925,7 +3790,7 @@ again:
path->nodes[0] = right;
path->slots[0] = 0;
if (path->slots[1] == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
* We create a new leaf 'right' for the required ins_len and
@@ -4024,7 +3889,8 @@ err:
return ret;
}
-static noinline int split_item(struct btrfs_path *path,
+static noinline int split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
const struct btrfs_key *new_key,
unsigned long split_offset)
{
@@ -4083,7 +3949,7 @@ static noinline int split_item(struct btrfs_path *path,
write_extent_buffer(leaf, buf + split_offset,
btrfs_item_ptr_offset(leaf, slot),
item_size - split_offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
BUG_ON(btrfs_leaf_free_space(leaf) < 0);
kfree(buf);
@@ -4117,7 +3983,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = split_item(path, new_key, split_offset);
+ ret = split_item(trans, path, new_key, split_offset);
return ret;
}
@@ -4127,7 +3993,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -4203,11 +4070,11 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
btrfs_set_item_size(leaf, slot, new_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4218,7 +4085,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
/*
* make the item pointed to by the path bigger, data_size is the added size.
*/
-void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
@@ -4268,7 +4136,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
data_end = old_data;
old_size = btrfs_item_size(leaf, slot);
btrfs_set_item_size(leaf, slot, old_size + data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4279,6 +4147,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
/*
* Make space in the node before inserting one or more items.
*
+ * @trans: transaction handle
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
* @batch: information about the batch of items to insert
@@ -4286,7 +4155,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
* Main purpose is to save stack depth by doing the bulk of the work in a
* function that doesn't call btrfs_search_slot
*/
-static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+static void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4306,7 +4176,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
*/
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -4365,7 +4235,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4376,12 +4246,14 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
/*
* Insert a new item into a leaf.
*
+ * @trans: Transaction handle.
* @root: The root of the btree.
* @path: A path pointing to the target leaf and slot.
* @key: The key of the new item.
* @data_size: The size of the data associated with the new key.
*/
-void btrfs_setup_item_for_insert(struct btrfs_root *root,
+void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *key,
u32 data_size)
@@ -4393,7 +4265,7 @@ void btrfs_setup_item_for_insert(struct btrfs_root *root,
batch.total_data_size = data_size;
batch.nr = 1;
- setup_items_for_insert(root, path, &batch);
+ setup_items_for_insert(trans, root, path, &batch);
}
/*
@@ -4419,7 +4291,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, batch);
+ setup_items_for_insert(trans, root, path, batch);
return 0;
}
@@ -4444,7 +4316,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
leaf = path->nodes[0];
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, data, ptr, data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
btrfs_free_path(path);
return ret;
@@ -4475,7 +4347,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4533,9 +4405,9 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- fixup_low_keys(path, &disk_key, level + 1);
+ fixup_low_keys(trans, path, &disk_key, level + 1);
}
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
return 0;
}
@@ -4567,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
*/
btrfs_unlock_up_safe(path, 0);
- root_sub_used(root, leaf->len);
+ root_sub_used_bytes(root);
atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
@@ -4632,7 +4504,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
@@ -4697,11 +4569,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* dirtied this buffer
*/
if (path->nodes[0] == leaf)
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
free_extent_buffer(leaf);
}
} else {
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
}
return ret;
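
One small cleanup in the ctree.c hunks above replaces the open-coded mask "buf->start & ~((u64)SZ_1G - 1)" with round_down(buf->start, SZ_1G). For a power-of-two alignment the two forms are equivalent; a standalone check of that equivalence, where ROUND_DOWN and SZ_1G are local stand-ins for the kernel macros:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_1G                   (1024ULL * 1024 * 1024)
/* Generic definition; for a power-of-two 'a' it matches the bit-mask form. */
#define ROUND_DOWN(x, a)        ((x) / (a) * (a))

int main(void)
{
        uint64_t start = 5 * SZ_1G + 123456789;

        assert(ROUND_DOWN(start, SZ_1G) == (start & ~(SZ_1G - 1)));
        printf("search_start = %llu\n", (unsigned long long)ROUND_DOWN(start, SZ_1G));
        return 0;
}
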
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ff40acd63a37..196c005c31f6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,37 +6,10 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/highmem.h>
-#include <linux/fs.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/completion.h>
-#include <linux/backing-dev.h>
-#include <linux/wait.h>
-#include <linux/slab.h>
-#include <trace/events/btrfs.h>
-#include <asm/unaligned.h>
#include <linux/pagemap.h>
-#include <linux/btrfs.h>
-#include <linux/btrfs_tree.h>
-#include <linux/workqueue.h>
-#include <linux/security.h>
-#include <linux/sizes.h>
-#include <linux/dynamic_debug.h>
-#include <linux/refcount.h>
-#include <linux/crc32c.h>
-#include <linux/iomap.h>
-#include <linux/fscrypt.h>
-#include "extent-io-tree.h"
-#include "extent_io.h"
-#include "extent_map.h"
-#include "async-thread.h"
-#include "block-rsv.h"
#include "locking.h"
-#include "misc.h"
#include "fs.h"
+#include "accessors.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -218,10 +191,22 @@ struct btrfs_root {
atomic_t log_commit[2];
/* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
+ /*
+ * Protected by the 'log_mutex' lock but can be read without holding
+ * that lock to avoid unnecessary lock contention, in which case it
+ * should be read using btrfs_get_root_log_transid() except if it's a
+ * log tree in which case it can be directly accessed. Updates to this
+ * field should always use btrfs_set_root_log_transid(), except for log
+ * trees where the field can be updated directly.
+ */
int log_transid;
/* No matter whether the commit succeeds or not. */
int log_transid_committed;
- /* Just be updated when the commit succeeds. */
+ /*
+ * Only updated when the commit succeeds. Use
+ * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit()
+ * to access this field.
+ */
int last_log_commit;
pid_t log_start_pid;
@@ -326,6 +311,9 @@ struct btrfs_root {
/* Used only by log trees, when logging csum items */
struct extent_io_tree log_csum_range;
+ /* Used in simple quotas, track root during relocation. */
+ u64 relocation_src_root;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -352,6 +340,26 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root)
return root->root_key.objectid;
}
+static inline int btrfs_get_root_log_transid(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->log_transid);
+}
+
+static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid)
+{
+ WRITE_ONCE(root->log_transid, log_transid);
+}
+
+static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->last_log_commit);
+}
+
+static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id)
+{
+ WRITE_ONCE(root->last_log_commit, commit_id);
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
@@ -470,30 +478,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
((bytes) >> (fs_info)->sectorsize_bits)
-static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
-{
- return crc32c(crc, address, length);
-}
-
-static inline void btrfs_crc32c_final(u32 crc, u8 *result)
-{
- put_unaligned_le32(~crc, result);
-}
-
-static inline u64 btrfs_name_hash(const char *name, int len)
-{
- return crc32c((u32)~1, name, len);
-}
-
-/*
- * Figure the key offset of an extended inode ref
- */
-static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
- int len)
-{
- return (u64) crc32c(parent_objectid, name, len);
-}
-
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_constraint(mapping, ~__GFP_FS);
@@ -513,12 +497,42 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is the same as the CPU order and
+ * we can avoid the conversion.
+ */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
+/* Compare two keys in a memcmp fashion. */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk,
+ const struct btrfs_key *k2)
+{
+ struct btrfs_key k1;
+
+ btrfs_disk_key_to_cpu(&k1, disk);
+
+ return btrfs_comp_cpu_keys(&k1, k2);
+}
+
+#endif
+
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
@@ -536,6 +550,13 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
enum btrfs_lock_nesting nest);
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -545,8 +566,10 @@ int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
-void btrfs_extend_item(struct btrfs_path *path, u32 data_size);
-void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 data_size);
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -567,10 +590,6 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_path *p, int find_higher,
int return_any);
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, u64 *last_ret,
- struct btrfs_key *progress);
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
@@ -610,7 +629,8 @@ struct btrfs_item_batch {
int nr;
};
-void btrfs_setup_item_for_insert(struct btrfs_root *root,
+void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *key,
u32 data_size);
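
btrfs_comp_keys(), moved into ctree.h above, keeps a little-endian fast path: when the host byte order already matches the on-disk (little-endian) order, the disk key can be compared without any conversion, while big-endian hosts must byte-swap first. A userspace sketch of the same idea using glibc's <endian.h> helpers; the structs are illustrative and not the real btrfs key layout:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct disk_key { uint64_t objectid; }; /* stored little-endian on disk */
struct cpu_key  { uint64_t objectid; }; /* native byte order */

static int comp_keys(const struct disk_key *d, const struct cpu_key *k)
{
#if __BYTE_ORDER == __LITTLE_ENDIAN
        uint64_t v = d->objectid;               /* no conversion needed */
#else
        uint64_t v = le64toh(d->objectid);      /* byte-swap on big-endian hosts */
#endif
        if (v < k->objectid)
                return -1;
        return v > k->objectid ? 1 : 0;
}

int main(void)
{
        struct disk_key d = { .objectid = htole64(42) };
        struct cpu_key k = { .objectid = 42 };

        printf("comp_keys = %d\n", comp_keys(&d, &k));  /* prints 0: keys are equal */
        return 0;
}
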
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index f2ff4cbe8656..5244561e2016 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -338,13 +338,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
}
/*
+ * Check if two block addresses are close, used by defrag.
+ */
+static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+ if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
+ return true;
+ if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
+ return true;
+ return false;
+}
+
+/*
+ * Go through all the leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order.
+ */
+static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *parent,
+ int start_slot, u64 *last_ret,
+ struct btrfs_key *progress)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ const u32 blocksize = fs_info->nodesize;
+ const int end_slot = btrfs_header_nritems(parent) - 1;
+ u64 search_start = *last_ret;
+ u64 last_block = 0;
+ int ret = 0;
+ bool progress_passed = false;
+
+ /*
+ * COWing must happen through a running transaction, which always
+ * matches the current fs generation (it's a transaction with a state
+ * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
+ * into error state to prevent the commit of any transaction.
+ */
+ if (unlikely(trans->transaction != fs_info->running_transaction ||
+ trans->transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
+ parent->start, btrfs_root_id(root), trans->transid,
+ fs_info->running_transaction->transid,
+ fs_info->generation);
+ return -EUCLEAN;
+ }
+
+ if (btrfs_header_nritems(parent) <= 1)
+ return 0;
+
+ for (int i = start_slot; i <= end_slot; i++) {
+ struct extent_buffer *cur;
+ struct btrfs_disk_key disk_key;
+ u64 blocknr;
+ u64 other;
+ bool close = true;
+
+ btrfs_node_key(parent, &disk_key, i);
+ if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
+ continue;
+
+ progress_passed = true;
+ blocknr = btrfs_node_blockptr(parent, i);
+ if (last_block == 0)
+ last_block = blocknr;
+
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent, i - 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (!close && i < end_slot) {
+ other = btrfs_node_blockptr(parent, i + 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (close) {
+ last_block = blocknr;
+ continue;
+ }
+
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
+ if (search_start == 0)
+ search_start = last_block;
+
+ btrfs_tree_lock(cur);
+ ret = btrfs_force_cow_block(trans, root, cur, parent, i,
+ &cur, search_start,
+ min(16 * blocksize,
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ break;
+ }
+ search_start = cur->start;
+ last_block = cur->start;
+ *last_ret = search_start;
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ }
+ return ret;
+}
+
+/*
* Defrag all the leaves in a given btree.
* Read all the leaves and try to get key order to
* better reflect disk order
*/
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -461,6 +566,45 @@ done:
}
/*
+ * Defrag a given btree. Every leaf in the btree is read and defragmented.
+ */
+int btrfs_defrag_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
+ return 0;
+
+ while (1) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ ret = btrfs_defrag_leaves(trans, root);
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+ cond_resched();
+
+ if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
+ break;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ btrfs_debug(fs_info, "defrag_root cancelled");
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
+ return ret;
+}
+
+/*
* Defrag specific helper to get an extent map.
*
* Differences between this and btrfs_get_extent() are:
@@ -891,8 +1035,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* very likely resulting in a larger extent after writeback is
* triggered (except in a case of free space fragmentation).
*/
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
+ if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC))
goto next;
/*
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 5305f2283b5e..5a62763528d1 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -12,7 +12,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
{
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 427abaf608b8..2833e8ef4c09 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -199,7 +199,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
start = round_down(start, fs_info->sectorsize);
btrfs_free_reserved_data_space_noquota(fs_info, len);
- btrfs_qgroup_free_data(inode, reserved, start, len);
+ btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
}
/*
@@ -322,9 +322,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
-
- if (btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
@@ -346,7 +343,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
noflush);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 90aaedce1548..7381241334e8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -328,7 +328,8 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
}
/*
- * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * Look up the delayed item by key.
+ *
* @delayed_node: pointer to the delayed node
* @index: the dir index value to lookup (offset of a dir index key)
*
@@ -517,7 +518,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
/*
* For insertions we track reserved metadata space by accounting
* for the number of leaves that will be used, based on the delayed
- * node's index_items_size field.
+ * node's curr_index_batch_size and index_item_leaves fields.
*/
if (item->type == BTRFS_DELAYED_DELETION_ITEM)
item->bytes_reserved = num_bytes;
@@ -1030,7 +1031,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
sizeof(struct btrfs_inode_item));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
@@ -1378,8 +1379,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
- NULL);
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL);
async_work->nr = nr;
btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
@@ -1760,8 +1760,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
}
/*
- * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
- *
+ * Read dir info stored in the delayed tree.
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list)
@@ -1834,24 +1833,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_stack_timespec_sec(&inode_item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
- btrfs_set_stack_timespec_sec(&inode_item->otime,
- BTRFS_I(inode)->i_otime.tv_sec);
- btrfs_set_stack_timespec_nsec(&inode_item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec);
+ btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
}
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
@@ -1891,19 +1888,17 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
- inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
- inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+ inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+ btrfs_stack_timespec_nsec(&inode_item->atime));
- inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
- inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
+ inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+ btrfs_stack_timespec_nsec(&inode_item->mtime));
inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
btrfs_stack_timespec_nsec(&inode_item->ctime));
- BTRFS_I(inode)->i_otime.tv_sec =
- btrfs_stack_timespec_sec(&inode_item->otime);
- BTRFS_I(inode)->i_otime.tv_nsec =
- btrfs_stack_timespec_nsec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1914,9 +1909,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
}
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_delayed_node *delayed_node;
int ret = 0;
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1da213197f55..5cceb31bbd16 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -135,7 +135,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode);
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9fe4ccca50a0..891ea2fa263c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -57,16 +57,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
* Release a ref head's reservation.
*
* @fs_info: the filesystem
- * @nr: number of items to drop
+ * @nr_refs: number of delayed refs to drop
+ * @nr_csums: number of csum items to drop
*
* Drops the delayed ref head's count from the delayed refs rsv and free any
* excess reservation we had.
*/
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
- const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
- u64 released = 0;
+ u64 num_bytes;
+ u64 released;
+
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
+ num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
@@ -77,26 +81,118 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
/*
* Adjust the size of the delayed refs rsv.
*
- * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
- * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates
+ * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and
+ * add it to the delayed_refs_rsv.
*/
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv;
u64 num_bytes;
+ u64 reserved_bytes;
- if (!trans->delayed_ref_updates)
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
+ num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
+ trans->delayed_ref_csum_deletions);
+
+ if (num_bytes == 0)
return;
- num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- trans->delayed_ref_updates);
+ /*
+ * Try to take num_bytes from the transaction's local delayed reserve.
+ * If not possible, try to take as much as is available. If the local
+ * reserve doesn't have enough reserved space, the delayed refs reserve
+ * will be refilled next time btrfs_delayed_refs_rsv_refill() is called
+ * by someone or, if a transaction commit is triggered before that, the
+ * global block reserve will be used. We want to minimize using the
+ * global block reserve for cases we can account for in advance, to
+ * avoid exhausting it and reaching -ENOSPC during a transaction commit.
+ */
+ spin_lock(&local_rsv->lock);
+ reserved_bytes = min(num_bytes, local_rsv->reserved);
+ local_rsv->reserved -= reserved_bytes;
+ local_rsv->full = (local_rsv->reserved >= local_rsv->size);
+ spin_unlock(&local_rsv->lock);
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
- delayed_rsv->full = false;
+ delayed_rsv->reserved += reserved_bytes;
+ delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size);
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
+ trans->delayed_ref_csum_deletions = 0;
+}
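A hedged arithmetic sketch of the split performed above, with made-up numbers; only the min() split and the size/reserved bookkeeping are taken from the code in this hunk:

        /* Illustration only: the queued updates need 192K, while the
         * transaction's local reserve still holds 128K. */
        u64 num_bytes = 192 * 1024;
        u64 local_reserved = 128 * 1024;
        u64 moved = min(num_bytes, local_reserved);     /* 128K migrates */

        /* delayed_rsv->size grows by the full 192K but delayed_rsv->reserved
         * only by the 128K that was moved, so the remaining 64K is either
         * refilled later via btrfs_delayed_refs_rsv_refill() or, at worst,
         * covered by the global reserve at commit time. */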
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * insertion, used after allocating a block group.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+ spin_lock(&delayed_rsv->lock);
+ /*
+ * Inserting a block group item does not require changing the free space
+ * tree, only the extent tree or the block group tree, so this is all we
+ * need.
+ */
+ delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item insertion.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 released;
+
+ released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * update.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+ spin_lock(&delayed_rsv->lock);
+ /*
+ * Updating a block group item does not result in new nodes/leaves and
+ * does not require changing the free space tree, only the extent tree
+ * or the block group tree, so this is all we need.
+ */
+ delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item update.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
+ u64 released;
+
+ released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
}
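A hedged sketch of how the two block group item helpers above are expected to pair up; the insert_block_group_item() call named here is only a placeholder for whatever actually writes the item:

        /* When a new block group is created, pre-account for the future
         * insertion of its block group item: */
        btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);

        /* ... and once the item has been inserted (or the block group is
         * discarded on error), release that accounting again: */
        ret = insert_block_group_item(trans, block_group);      /* placeholder */
        btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);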
/*
@@ -154,6 +250,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *space_info = block_rsv->space_info;
u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
u64 num_bytes = 0;
u64 refilled_bytes;
@@ -170,7 +267,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
if (ret)
return ret;
@@ -199,8 +296,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
if (to_free > 0)
- btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info,
- to_free);
+ btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
if (refilled_bytes > 0)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
@@ -422,7 +518,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
return 0;
}
-static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
@@ -433,9 +530,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
-static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static bool merge_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref,
u64 seq)
@@ -464,10 +563,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
mod = -next->ref_mod;
}
- drop_delayed_ref(delayed_refs, head, next);
+ drop_delayed_ref(fs_info, delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
- drop_delayed_ref(delayed_refs, head, ref);
+ drop_delayed_ref(fs_info, delayed_refs, head, ref);
done = true;
} else {
/*
@@ -505,7 +604,7 @@ again:
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
- if (merge_ref(delayed_refs, head, ref, seq))
+ if (merge_ref(fs_info, delayed_refs, head, ref, seq))
goto again;
}
}
@@ -584,10 +683,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
* Return true if the ref was merged into an existing one (and therefore can be
* freed by the caller).
*/
-static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
+static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
+ struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
struct btrfs_delayed_ref_node *exist;
int mod;
@@ -598,6 +698,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
+ trans->delayed_ref_updates++;
return false;
}
@@ -626,7 +727,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
/* remove existing tail if its ref_mod is zero */
if (exist->ref_mod == 0)
- drop_delayed_ref(root, href, exist);
+ drop_delayed_ref(trans->fs_info, root, href, exist);
spin_unlock(&href->lock);
return true;
}
@@ -647,6 +748,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
BUG_ON(existing->is_data != update->is_data);
spin_lock(&existing->lock);
+
+ /*
+ * When freeing an extent, we may not know the owning root when we
+ * first create the head_ref. However, some deref before the last deref
+ * will know it, so we just need to update the head_ref accordingly.
+ */
+ if (!existing->owning_root)
+ existing->owning_root = update->owning_root;
+
if (update->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -656,6 +766,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
* Set it again here
*/
existing->must_insert_reserved = update->must_insert_reserved;
+ existing->owning_root = update->owning_root;
/*
* update the num_bytes so we make sure the accounting
@@ -695,6 +806,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
/*
* If we are going from a positive ref mod to a negative or vice
* versa, we need to make sure to adjust pending_csums accordingly.
+ * We reserve bytes for csum deletion when adding or updating a ref head,
+ * see add_delayed_ref_head() for more details.
*/
if (existing->is_data) {
u64 csum_leaves =
@@ -703,11 +816,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
delayed_refs->pending_csums -= existing->num_bytes;
- btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+ btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
}
if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
delayed_refs->pending_csums += existing->num_bytes;
- trans->delayed_ref_updates += csum_leaves;
+ trans->delayed_ref_csum_deletions += csum_leaves;
}
}
@@ -718,7 +831,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr, u64 num_bytes, u64 ref_root,
u64 reserved, int action, bool is_data,
- bool is_system)
+ bool is_system, u64 owning_root)
{
int count_mod = 1;
bool must_insert_reserved = false;
@@ -758,7 +871,9 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->bytenr = bytenr;
head_ref->num_bytes = num_bytes;
head_ref->ref_mod = count_mod;
+ head_ref->reserved_bytes = reserved;
head_ref->must_insert_reserved = must_insert_reserved;
+ head_ref->owning_root = owning_root;
head_ref->is_data = is_data;
head_ref->is_system = is_system;
head_ref->ref_tree = RB_ROOT_CACHED;
@@ -819,16 +934,21 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ /*
+ * We reserve the amount of bytes needed to delete csums when
+ * adding the ref head and not when adding individual drop refs
+ * since the csum items are deleted only after running the last
+ * delayed drop ref (the data extent's ref count drops to 0).
+ */
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
- trans->delayed_ref_updates +=
+ trans->delayed_ref_csum_deletions +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -837,8 +957,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
/*
- * init_delayed_ref_common - Initialize the structure which represents a
- * modification to a an extent.
+ * Initialize the structure which represents a modification to an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
@@ -909,7 +1028,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
@@ -922,8 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -938,15 +1056,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.owning_root, action,
+ generic_ref->tree_ref.ref_root, action,
ref_type);
- ref->root = generic_ref->tree_ref.owning_root;
+ ref->root = generic_ref->tree_ref.ref_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.owning_root, 0, action,
- false, is_system);
+ generic_ref->tree_ref.ref_root, 0, action,
+ false, is_system, generic_ref->owning_root);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -959,7 +1077,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -998,7 +1116,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.owning_root;
+ u64 ref_root = generic_ref->data_ref.ref_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
@@ -1026,8 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -1038,7 +1155,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false);
+ reserved, action, true, false, generic_ref->owning_root);
head_ref->extent_op = NULL;
delayed_refs = &trans->transaction->delayed_refs;
@@ -1051,7 +1168,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1084,7 +1201,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, false, false);
+ BTRFS_UPDATE_DELAYED_HEAD, false, false, 0);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index fd9bf2b709c0..62d679d40f4f 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -9,10 +9,16 @@
#include <linux/refcount.h>
/* these are the possible values of struct btrfs_delayed_ref_node->action */
-#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
-#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
-#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
-#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+enum btrfs_delayed_ref_action {
+ /* Add one backref to the tree */
+ BTRFS_ADD_DELAYED_REF = 1,
+ /* Delete one backref from the tree */
+ BTRFS_DROP_DELAYED_REF,
+ /* Record a full extent allocation */
+ BTRFS_ADD_DELAYED_EXTENT,
+ /* Not changing ref count on head ref */
+ BTRFS_UPDATE_DELAYED_HEAD,
+} __packed;
struct btrfs_delayed_ref_node {
struct rb_node ref_node;
@@ -105,6 +111,18 @@ struct btrfs_delayed_ref_head {
int ref_mod;
/*
+ * The root that triggered the allocation when must_insert_reserved is
+ * set to true.
+ */
+ u64 owning_root;
+
+ /*
+ * Track reserved bytes when setting must_insert_reserved. On success
+ * or cleanup, we will need to free the reservation.
+ */
+ u64 reserved_bytes;
+
+ /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -117,6 +135,7 @@ struct btrfs_delayed_ref_head {
* the free has happened.
*/
bool must_insert_reserved;
+
bool is_data;
bool is_system;
bool processing;
@@ -183,13 +202,13 @@ enum btrfs_ref_type {
BTRFS_REF_DATA,
BTRFS_REF_METADATA,
BTRFS_REF_LAST,
-};
+} __packed;
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Original root this data extent belongs to */
- u64 owning_root;
+ /* Root which owns this data reference. */
+ u64 ref_root;
/* Inode which refers to this data extent */
u64 ino;
@@ -212,18 +231,18 @@ struct btrfs_tree_ref {
int level;
/*
- * Root which owns this tree block.
+ * Root which owns this tree block reference.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 owning_root;
+ u64 ref_root;
/* For non-skinny metadata, no special member needed */
};
struct btrfs_ref {
enum btrfs_ref_type type;
- int action;
+ enum btrfs_delayed_ref_action action;
/*
* Whether this extent should go through qgroup record.
@@ -239,6 +258,7 @@ struct btrfs_ref {
#endif
u64 bytenr;
u64 len;
+ u64 owning_root;
/* Bytenr of the parent tree block */
u64 parent;
@@ -277,24 +297,37 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in
return num_bytes;
}
+static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
+ int num_csum_items)
+{
+ /*
+ * Deleting csum items does not result in new nodes/leaves and does not
+ * require changing the free space tree, only the csum tree, so this is
+ * all we need.
+ */
+ return btrfs_calc_metadata_size(fs_info, num_csum_items);
+}
+
static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
- int action, u64 bytenr, u64 len, u64 parent)
+ int action, u64 bytenr, u64 len,
+ u64 parent, u64 owning_root)
{
generic_ref->action = action;
generic_ref->bytenr = bytenr;
generic_ref->len = len;
generic_ref->parent = parent;
+ generic_ref->owning_root = owning_root;
}
-static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root, u64 mod_root, bool skip_qgroup)
+static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level,
+ u64 root, u64 mod_root, bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: root;
#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.owning_root = root;
+ generic_ref->tree_ref.ref_root = root;
generic_ref->type = BTRFS_REF_METADATA;
if (skip_qgroup || !(is_fstree(root) &&
(!mod_root || is_fstree(mod_root))))
@@ -312,7 +345,7 @@ static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: ref_root;
#endif
- generic_ref->data_ref.owning_root = ref_root;
+ generic_ref->data_ref.ref_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
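A hedged sketch of building a metadata drop ref with the renamed fields: ref_root now lives in the tree-specific part while owning_root travels in the generic ref. All variables are placeholders, and btrfs_free_extent() is assumed to still take the transaction plus the ref:

        struct btrfs_ref ref = { 0 };
        int ret;

        /* Generic part: action, extent location/size, parent and owning root. */
        btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
                               fs_info->nodesize, parent, owning_root);
        /* Tree-specific part: level plus the root holding this reference. */
        btrfs_init_tree_ref(&ref, level, ref_root, mod_root, false);

        ret = btrfs_free_extent(trans, &ref);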
@@ -338,7 +371,6 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
- WARN_ON(refcount_read(&ref->refs) == 0);
if (refcount_dec_and_test(&ref->refs)) {
WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
switch (ref->type) {
@@ -402,8 +434,12 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums);
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index fff22ed55c42..f9544fda38e9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -17,7 +17,6 @@
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
-#include "check-integrity.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
@@ -247,6 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -257,12 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->bdev_holder, NULL);
+ if (IS_ERR(bdev_handle)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev);
+ return PTR_ERR(bdev_handle);
}
+ bdev = bdev_handle->bdev;
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -313,9 +314,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->holder = fs_info->bdev_holder;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_devices;
@@ -334,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- blkdev_put(bdev, fs_info->bdev_holder);
+ bdev_release(bdev_handle);
return ret;
}
@@ -442,7 +443,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 082eb0e19598..9c07d5c3e5ad 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -38,7 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- btrfs_extend_item(path, data_size);
+ btrfs_extend_item(trans, path, data_size);
} else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
@@ -93,7 +93,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, name_ptr, name_len);
write_extent_buffer(leaf, data, data_ptr, data_len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
return ret;
}
@@ -153,7 +153,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
name_ptr = (unsigned long)(dir_item + 1);
write_extent_buffer(leaf, name->name, name_ptr, name->len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
second_insert:
/* FIXME, use some real flag for selecting the extra index */
@@ -439,7 +439,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- btrfs_truncate_item(path, item_len - sub_item_len, 1);
+ btrfs_truncate_item(trans, path, item_len - sub_item_len, 1);
}
return ret;
}
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index aab4b7cc7fa0..e40a226373d7 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -3,6 +3,10 @@
#ifndef BTRFS_DIR_ITEM_H
#define BTRFS_DIR_ITEM_H
+#include <linux/crc32c.h>
+
+struct fscrypt_str;
+
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
@@ -39,4 +43,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
const char *name,
int name_len);
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return crc32c((u32)~1, name, len);
+}
+
#endif
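For illustration, a stand-alone user-space sketch of the same name hash; the bit-by-bit CRC32C below is only meant to mirror the kernel's crc32c() seeded with (u32)~1, and the sample name is arbitrary:

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        /* Reflected CRC32C (Castagnoli polynomial), no pre/post inversion,
         * matching how the kernel's crc32c() consumes the seed handed in by
         * btrfs_name_hash(). */
        static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
        {
                const uint8_t *p = buf;

                while (len--) {
                        crc ^= *p++;
                        for (int i = 0; i < 8; i++)
                                crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1));
                }
                return crc;
        }

        static uint64_t btrfs_name_hash(const char *name, int len)
        {
                return crc32c((uint32_t)~1, name, len);
        }

        int main(void)
        {
                const char *name = "example";

                /* The hash becomes the offset of the DIR_ITEM key for this name. */
                printf("%llu\n", (unsigned long long)btrfs_name_hash(name, strlen(name)));
                return 0;
        }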
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 68f60d50e1fd..62cb97f7c94f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
@@ -245,6 +244,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start = btrfs_header_bytenr(eb);
+ u64 last_trans;
u8 result[BTRFS_CSUM_SIZE];
int ret;
@@ -282,12 +282,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
* Also check the generation, the eb reached here must be newer than
* last committed. Or something seriously wrong happened.
*/
- if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ last_trans = btrfs_get_last_trans_committed(fs_info);
+ if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"block=%llu bad generation, have %llu expect > %llu",
- eb->start, btrfs_header_generation(eb),
- fs_info->last_trans_committed);
+ eb->start, btrfs_header_generation(eb), last_trans);
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
@@ -318,9 +318,10 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
BTRFS_FSID_SIZE);
/*
- * alloc_fs_devices() copies the fsid into metadata_uuid if the
- * metadata_uuid is unset in the superblock, including for a seed device.
- * So, we can use fs_devices->metadata_uuid.
+ * alloc_fs_devices() copies the fsid into fs_devices::metadata_uuid.
+ * This is then overwritten by metadata_uuid if it is present in the
+ * device_list_add(). The same is true for a seed device as well. So use of
+ * fs_devices::metadata_uuid is appropriate here.
*/
if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
return false;
@@ -675,9 +676,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
- root->log_transid = 0;
+ btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- root->last_log_commit = 0;
+ btrfs_set_root_last_log_commit(root, 0);
root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
@@ -859,7 +860,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.offset = 0;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
- BTRFS_NESTING_NORMAL);
+ 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -867,7 +868,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
}
root->node = leaf;
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
root->commit_root = btrfs_root_node(root);
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -936,13 +937,13 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
+ NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf))
return PTR_ERR(leaf);
root->node = leaf;
- btrfs_mark_buffer_dirty(root->node);
+ btrfs_mark_buffer_dirty(trans, root->node);
btrfs_tree_unlock(root->node);
return 0;
@@ -1004,9 +1005,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
- root->log_transid = 0;
+ btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- root->last_log_commit = 0;
+ btrfs_set_root_last_log_commit(root, 0);
return 0;
}
@@ -1179,6 +1180,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
return btrfs_grab_root(fs_info->block_group_root);
case BTRFS_FREE_SPACE_TREE_OBJECTID:
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->stripe_root);
default:
return NULL;
}
@@ -1259,6 +1262,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
btrfs_put_root(fs_info->block_group_root);
+ btrfs_put_root(fs_info->stripe_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -1402,7 +1406,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * Return a root for the given objectid.
+ *
* @fs_info: the fs_info
* @objectid: the objectid we need to lookup
*
@@ -1699,11 +1704,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
}
/*
- * read_backup_root - Reads a backup root based on the passed priority. Prio 0
- * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
+ * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
+ * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots.
*
- * fs_info - filesystem whose backup roots need to be read
- * priority - priority of backup root required
+ * @fs_info: filesystem whose backup roots need to be read
+ * @priority: priority of backup root required
*
* Returns backup root index on success and -EINVAL otherwise.
*/
@@ -1803,6 +1808,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
free_root_extent_buffers(info->block_group_root);
+ free_root_extent_buffers(info->stripe_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -2262,7 +2268,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(root)) {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
fs_info->quota_root = root;
}
@@ -2279,6 +2284,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->stripe_root = root;
+ }
+ }
+
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2381,7 +2400,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
+ if (!fs_info->fs_devices->temp_fsid &&
+ memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
sb->fsid, fs_info->fs_devices->fsid);
@@ -2634,7 +2654,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
/* All successful */
fs_info->generation = btrfs_header_generation(tree_root->node);
- fs_info->last_trans_committed = fs_info->generation;
+ btrfs_set_last_trans_committed(fs_info, fs_info->generation);
fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
@@ -2735,9 +2755,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->ordered_root_lock);
btrfs_init_scrub(fs_info);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- fs_info->check_integrity_print_mask = 0;
-#endif
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
@@ -3157,7 +3174,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
u32 nodesize;
u32 stripesize;
u64 generation;
- u64 features;
u16 csum_type;
struct btrfs_super_block *disk_super;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -3197,6 +3213,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+ btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
/*
* Verify the type first, if that or the checksum value are
* corrupted, we'll find out
@@ -3239,15 +3256,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
disk_super = fs_info->super_copy;
-
- features = btrfs_super_flags(disk_super);
- if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
- features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
- btrfs_set_super_flags(disk_super, features);
- btrfs_info(fs_info,
- "found metadata UUID change in progress flag, clearing");
- }
-
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_for_commit));
@@ -3509,18 +3517,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
"auto enabling async discard");
}
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
- ret = btrfsic_mount(fs_info, fs_devices,
- btrfs_test_opt(fs_info,
- CHECK_INTEGRITY_DATA) ? 1 : 0,
- fs_info->check_integrity_print_mask);
- if (ret)
- btrfs_warn(fs_info,
- "failed to initialize integrity check module: %d",
- ret);
- }
-#endif
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3820,8 +3816,6 @@ static int write_dev_supers(struct btrfs_device *device,
*/
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
-
- btrfsic_check_bio(bio);
submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
@@ -3917,28 +3911,11 @@ static void write_dev_flush(struct btrfs_device *device)
device->last_flush_error = BLK_STS_OK;
-#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- /*
- * When a disk has write caching disabled, we skip submission of a bio
- * with flush and sync requests before writing the superblock, since
- * it's not needed. However when the integrity checker is enabled, this
- * results in reports that there are metadata blocks referred by a
- * superblock that were not properly flushed. So don't skip the bio
- * submission only when the integrity checker is enabled for the sake
- * of simplicity, since this is a debug tool and not meant for use in
- * non-debug builds.
- */
- if (!bdev_write_cache(device->bdev))
- return;
-#endif
-
bio_init(bio, device->bdev, NULL, 0,
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
-
- btrfsic_check_bio(bio);
submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
@@ -4414,16 +4391,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
- btrfsic_unmount(fs_info->fs_devices);
-#endif
-
btrfs_mapping_tree_free(&fs_info->mapping_tree);
btrfs_close_devices(fs_info->fs_devices);
}
-void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *buf)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
@@ -4437,21 +4410,16 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
+ /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
+ ASSERT(trans->transid == fs_info->generation);
btrfs_assert_tree_write_locked(buf);
- if (transid != fs_info->generation)
- WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
- buf->start, transid, fs_info->generation);
- set_extent_buffer_dirty(buf);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- /*
- * btrfs_check_leaf() won't check item data if we don't have WRITTEN
- * set, so this will only validate the basic structure of the items.
- */
- if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) {
- btrfs_print_leaf(buf);
- ASSERT(0);
+ if (unlikely(transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
+ buf->start, transid, fs_info->generation);
}
-#endif
+ set_extent_buffer_dirty(buf);
}
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
@@ -4611,6 +4579,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
if (head->must_insert_reserved)
pin_bytes = true;
@@ -4808,7 +4777,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4830,6 +4799,32 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
}
}
+static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ struct btrfs_root *root = gang[i];
+
+ btrfs_qgroup_free_meta_all_pertrans(root);
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ }
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+}
+
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
@@ -4858,6 +4853,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
+ btrfs_free_all_qgroup_pertrans(fs_info);
+
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 02b645744a82..50dab8f639dc 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -104,7 +104,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
}
void btrfs_put_root(struct btrfs_root *root);
-void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ff8e117a1ace..ea149be28dff 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -105,32 +105,40 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
+/*
+ * Empty an io tree, removing and freeing every extent state record from the
+ * tree. This should be called once we are sure no other task can access the
+ * tree anymore, so no tree updates happen after we empty the tree and there
+ * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
+ * set on any extent state when calling this function).
+ */
void extent_io_tree_release(struct extent_io_tree *tree)
{
+ struct rb_root root;
+ struct extent_state *state;
+ struct extent_state *tmp;
+
spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
+ root = tree->state;
+ tree->state = RB_ROOT;
+ rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
+ /* Clear node to keep free_extent_state() happy. */
RB_CLEAR_NODE(&state->rb_node);
+ ASSERT(!(state->state & EXTENT_LOCKED));
/*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
+ * No need for a memory barrier here, as we are holding the tree
+ * lock and we only change the waitqueue while holding that lock
+ * (see wait_extent_bit()).
*/
ASSERT(!waitqueue_active(&state->wq));
free_extent_state(state);
-
cond_resched_lock(&tree->lock);
}
+ /*
+ * Should still be empty even after a reschedule, no other task should
+ * be accessing the tree anymore.
+ */
+ ASSERT(RB_EMPTY_ROOT(&tree->state));
spin_unlock(&tree->lock);
}
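The detach-then-iterate pattern above is generic; a hedged kernel-style sketch for any rb-tree backed cache (the cache structure and entry type here are made up):

        struct rb_root root;
        struct cache_entry *entry, *tmp;        /* hypothetical entry type */

        /* Detach the whole tree under the lock, then free the entries with a
         * postorder walk, avoiding per-node rebalancing via rb_erase(). */
        spin_lock(&cache->lock);
        root = cache->root;
        cache->root = RB_ROOT;
        spin_unlock(&cache->lock);

        rbtree_postorder_for_each_entry_safe(entry, tmp, &root, rb_node) {
                RB_CLEAR_NODE(&entry->rb_node);
                kfree(entry);
        }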
@@ -327,6 +335,36 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
"locking error: extent tree was modified by another thread while locked");
}
+static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *prev;
+
+ prev = prev_state(state);
+ if (prev && prev->end == state->start - 1 && prev->state == state->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, prev);
+ state->start = prev->start;
+ rb_erase(&prev->rb_node, &tree->state);
+ RB_CLEAR_NODE(&prev->rb_node);
+ free_extent_state(prev);
+ }
+}
+
+static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *next;
+
+ next = next_state(state);
+ if (next && next->start == state->end + 1 && next->state == state->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, next);
+ state->end = next->end;
+ rb_erase(&next->rb_node, &tree->state);
+ RB_CLEAR_NODE(&next->rb_node);
+ free_extent_state(next);
+ }
+}
+
/*
* Utility function to look for merge candidates inside a given range. Any
* extents with matching state are merged together into a single extent in the
@@ -338,31 +376,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*/
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
- struct extent_state *other;
-
if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
return;
- other = prev_state(state);
- if (other && other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- other = next_state(state);
- if (other && other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
+ merge_prev_state(tree, state);
+ merge_next_state(tree, state);
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -384,19 +402,27 @@ static void set_state_bits(struct extent_io_tree *tree,
* Insert an extent_state struct into the tree. 'bits' are set on the
* struct before it is inserted.
*
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
+ * Returns a pointer to the struct extent_state record containing the range
+ * requested for insertion, which may be the same as the given struct or it
+ * may be an existing record in the tree that was expanded to accommodate the
+ * requested range. In case of an extent_state different from the one that was
+ * given, the latter can be freed or reused by the caller.
+ *
+ * On error it returns an error pointer.
*
* The tree lock is not taken internally. This is a utility function and
* probably isn't what you want to call (see set/clear_extent_bit).
*/
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, struct extent_changeset *changeset)
+static struct extent_state *insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits,
+ struct extent_changeset *changeset)
{
struct rb_node **node;
struct rb_node *parent = NULL;
- const u64 end = state->end;
+ const u64 start = state->start - 1;
+ const u64 end = state->end + 1;
+ const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));
set_state_bits(tree, state, bits, changeset);
@@ -407,23 +433,42 @@ static int insert_state(struct extent_io_tree *tree,
parent = *node;
entry = rb_entry(parent, struct extent_state, rb_node);
- if (end < entry->start) {
+ if (state->end < entry->start) {
+ if (try_merge && end == entry->start &&
+ state->state == entry->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
+ entry->start = state->start;
+ merge_prev_state(tree, entry);
+ state->state = 0;
+ return entry;
+ }
node = &(*node)->rb_left;
- } else if (end > entry->end) {
+ } else if (state->end > entry->end) {
+ if (try_merge && entry->end == start &&
+ state->state == entry->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
+ entry->end = state->end;
+ merge_next_state(tree, entry);
+ state->state = 0;
+ return entry;
+ }
node = &(*node)->rb_right;
} else {
btrfs_err(tree->fs_info,
"found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, end);
- return -EEXIST;
+ entry->start, entry->end, state->start, state->end);
+ return ERR_PTR(-EEXIST);
}
}
rb_link_node(&state->rb_node, parent, node);
rb_insert_color(&state->rb_node, &tree->state);
- merge_state(tree, state);
- return 0;
+ return state;
}
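An illustrative caller-side sketch of this return contract. The wrapper name insert_prealloc_range() is hypothetical; insert_state(), cache_state() and free_extent_state() are the helpers from this file, and the tree lock is assumed to be held as usual:

/*
 * Sketch only: how a caller consumes the insert_state() return value.
 * 'prealloc' is a preallocated extent_state owned by the caller.
 */
static int insert_prealloc_range(struct extent_io_tree *tree,
				 struct extent_state *prealloc,
				 u64 start, u64 end, u32 bits,
				 struct extent_state **cached)
{
	struct extent_state *inserted;

	prealloc->start = start;
	prealloc->end = end;
	inserted = insert_state(tree, prealloc, bits, NULL);
	if (IS_ERR(inserted))
		return PTR_ERR(inserted);

	/* Cache whichever record ended up covering the range. */
	cache_state(inserted, cached);

	/*
	 * If insert_state() merged into an existing record, the prealloc
	 * was never linked into the tree and may be freed or reused.
	 */
	if (inserted != prealloc)
		free_extent_state(prealloc);
	return 0;
}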
/*
@@ -708,26 +753,13 @@ out:
}
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
/*
* Wait for one or more bits to clear on a range in the state tree.
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state)
{
struct extent_state *state;
@@ -758,9 +790,15 @@ process_node:
goto out;
if (state->state & bits) {
+ DEFINE_WAIT(wait);
+
start = state->start;
refcount_inc(&state->refs);
- wait_on_state(tree, state);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
free_extent_state(state);
goto again;
}
@@ -847,10 +885,19 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
if (state->end == start - 1 && extent_state_in_tree(state)) {
while ((state = next_state(state)) != NULL) {
if (state->state & bits)
- goto got_it;
+ break;
}
+ /*
+ * If we found the next extent state, clear cached_state
+ * so that we can cache it below and avoid future calls
+ * going over the same extent state again. If we haven't
+ * found any, clear it as well since it's now useless.
+ */
free_extent_state(*cached_state);
*cached_state = NULL;
+ if (state)
+ goto got_it;
goto out;
}
free_extent_state(*cached_state);
@@ -1133,6 +1180,8 @@ hit_next:
*/
if (state->start > start) {
u64 this_end;
+ struct extent_state *inserted_state;
+
if (end < last_start)
this_end = end;
else
@@ -1148,12 +1197,15 @@ hit_next:
*/
prealloc->start = start;
prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, changeset);
- if (err)
+ inserted_state = insert_state(tree, prealloc, bits, changeset);
+ if (IS_ERR(inserted_state)) {
+ err = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, err);
+ }
- cache_state(prealloc, cached_state);
- prealloc = NULL;
+ cache_state(inserted_state, cached_state);
+ if (inserted_state == prealloc)
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -1356,6 +1408,8 @@ hit_next:
*/
if (state->start > start) {
u64 this_end;
+ struct extent_state *inserted_state;
+
if (end < last_start)
this_end = end;
else
@@ -1373,11 +1427,14 @@ hit_next:
*/
prealloc->start = start;
prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, NULL);
- if (err)
+ inserted_state = insert_state(tree, prealloc, bits, NULL);
+ if (IS_ERR(inserted_state)) {
+ err = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
+ }
+ cache_state(inserted_state, cached_state);
+ if (inserted_state == prealloc)
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -1640,15 +1697,46 @@ search:
}
/*
- * Search a range in the state tree for a given mask. If 'filled' == 1, this
- * returns 1 only if every extent in the tree has the bits set. Otherwise, 1
- * is returned if any bit in the range is found set.
+ * Check if the single @bit exists in the given range.
+ */
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
+{
+ struct extent_state *state = NULL;
+ bool bitset = false;
+
+ ASSERT(is_power_of_2(bit));
+
+ spin_lock(&tree->lock);
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (state->start > end)
+ break;
+
+ if (state->state & bit) {
+ bitset = true;
+ break;
+ }
+
+ /* If state->end is (u64)-1, start will overflow to 0 */
+ start = state->end + 1;
+ if (start > end || start == 0)
+ break;
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/*
+ * Check if the single @bit is set across the whole range [@start, @end].
*/
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached)
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached)
{
struct extent_state *state = NULL;
- int bitset = 0;
+ bool bitset = true;
+
+ ASSERT(is_power_of_2(bit));
spin_lock(&tree->lock);
if (cached && extent_state_in_tree(cached) && cached->start <= start &&
@@ -1657,35 +1745,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
else
state = tree_search(tree, start);
while (state && start <= end) {
- if (filled && state->start > start) {
- bitset = 0;
+ if (state->start > start) {
+ bitset = false;
break;
}
if (state->start > end)
break;
- if (state->state & bits) {
- bitset = 1;
- if (!filled)
- break;
- } else if (filled) {
- bitset = 0;
+ if ((state->state & bit) == 0) {
+ bitset = false;
break;
}
if (state->end == (u64)-1)
break;
+ /*
+ * Stop if this was the last entry (state->end is (u64)-1, so start
+ * overflows to 0) or the next entry starts after the range.
+ */
start = state->end + 1;
- if (start > end)
+ if (start > end || start == 0)
break;
state = next_state(state);
}
/* We ran out of states and were still inside of our range. */
- if (filled && !state)
- bitset = 0;
+ if (!state)
+ bitset = false;
spin_unlock(&tree->lock);
return bitset;
}
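A brief usage sketch contrasting the two predicates; the wrapper names are hypothetical, while the bits match the extent_io.c call sites updated further below:

/* Returns true only if every byte of [start, end] has EXTENT_DELALLOC set. */
static bool range_fully_delalloc(struct extent_io_tree *tree, u64 start,
				 u64 end, struct extent_state *cached)
{
	return test_range_bit(tree, start, end, EXTENT_DELALLOC, cached);
}

/* Returns true if any part of [start, end] has EXTENT_LOCKED set. */
static bool range_has_locked(struct extent_io_tree *tree, u64 start, u64 end)
{
	return test_range_bit_exists(tree, start, end, EXTENT_LOCKED);
}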
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 28c23a23d121..5602b0137fcd 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -131,8 +131,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
struct extent_state **cached_state);
void free_extent_state(struct extent_state *state);
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached_state);
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached_state);
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -192,7 +193,5 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state);
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fc313fce5bbd..01423670bc8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -42,14 +42,16 @@
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
+#include "raid-stripe-tree.h"
#undef SCRAMBLE_DELAYED_REFS
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
- u64 owner_offset, int refs_to_drop,
+ u64 owner_offset,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -57,7 +59,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
- struct btrfs_key *ins, int ref_mod);
+ struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
@@ -100,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags)
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owning_root)
{
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
@@ -112,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u32 item_size;
u64 num_refs;
u64 extent_flags;
+ u64 owner = 0;
int ret;
/*
@@ -165,6 +169,8 @@ search_again:
struct btrfs_extent_item);
num_refs = btrfs_extent_refs(leaf, ei);
extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf,
+ path->slots[0]);
} else {
ret = -EUCLEAN;
btrfs_err(fs_info,
@@ -224,6 +230,8 @@ out:
*refs = num_refs;
if (flags)
*flags = extent_flags;
+ if (owning_root)
+ *owning_root = owner;
out_free:
btrfs_free_path(path);
return ret;
@@ -344,9 +352,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int type = btrfs_extent_inline_ref_type(eb, iref);
u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ return type;
+ }
+
if (type == BTRFS_TREE_BLOCK_REF_KEY ||
type == BTRFS_SHARED_BLOCK_REF_KEY ||
type == BTRFS_SHARED_DATA_REF_KEY ||
@@ -355,26 +369,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_TREE_BLOCK_REF_KEY)
return type;
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
- ASSERT(eb->fs_info);
+ ASSERT(fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
- if (offset &&
- IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return type;
if (type == BTRFS_SHARED_DATA_REF_KEY) {
- ASSERT(eb->fs_info);
+ ASSERT(fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ IS_ALIGNED(offset, fs_info->sectorsize))
return type;
}
} else {
@@ -385,7 +398,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
WARN_ON(1);
btrfs_print_leaf(eb);
- btrfs_err(eb->fs_info,
+ btrfs_err(fs_info,
"eb %llu iref 0x%lx invalid extent inline ref type %d",
eb->start, (unsigned long)iref, type);
@@ -399,11 +412,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -575,7 +588,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
fail:
btrfs_release_path(path);
@@ -623,7 +636,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
return ret;
}
@@ -789,7 +802,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
int type;
int want;
int ret;
- int err = 0;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
int needed;
@@ -816,10 +828,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
/*
* We may be a newly converted file system which still has the old fat
@@ -846,7 +856,7 @@ again:
}
if (ret && !insert) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out;
} else if (WARN_ON(ret)) {
btrfs_print_leaf(path->nodes[0]);
@@ -854,18 +864,18 @@ again:
"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
bytenr, num_bytes, parent, root_objectid, owner,
offset);
- err = -EIO;
+ ret = -EUCLEAN;
goto out;
}
leaf = path->nodes[0];
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %llu expect >= %zu",
item_size, sizeof(*ei));
- btrfs_abort_transaction(trans, err);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -885,22 +895,17 @@ again:
else
needed = BTRFS_REF_TYPE_BLOCK;
- err = -ENOENT;
- while (1) {
- if (ptr >= end) {
- if (ptr > end) {
- err = -EUCLEAN;
- btrfs_print_leaf(path->nodes[0]);
- btrfs_crit(fs_info,
-"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
- path->slots[0], root_objectid, owner, offset, parent);
- }
- break;
- }
+ ret = -ENOENT;
+ while (ptr < end) {
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ ptr += btrfs_extent_inline_ref_size(type);
+ continue;
+ }
if (type == BTRFS_REF_TYPE_INVALID) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
goto out;
}
@@ -916,7 +921,7 @@ again:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (match_extent_data_ref(leaf, dref, root_objectid,
owner, offset)) {
- err = 0;
+ ret = 0;
break;
}
if (hash_extent_data_ref_item(leaf, dref) <
@@ -927,14 +932,14 @@ again:
ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
if (parent > 0) {
if (parent == ref_offset) {
- err = 0;
+ ret = 0;
break;
}
if (ref_offset < parent)
break;
} else {
if (root_objectid == ref_offset) {
- err = 0;
+ ret = 0;
break;
}
if (ref_offset < root_objectid)
@@ -943,10 +948,20 @@ again:
}
ptr += btrfs_extent_inline_ref_size(type);
}
- if (err == -ENOENT && insert) {
+
+ if (unlikely(ptr > end)) {
+ ret = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ goto out;
+ }
+
+ if (ret == -ENOENT && insert) {
if (item_size + extra_size >=
BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
@@ -958,7 +973,7 @@ again:
if (find_next_key(path, 0, &key) == 0 &&
key.objectid == bytenr &&
key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
}
@@ -969,14 +984,14 @@ out:
path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
- return err;
+ return ret;
}
/*
* helper to add new inline back ref
*/
static noinline_for_stack
-void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
+void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
u64 parent, u64 root_objectid,
@@ -999,7 +1014,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- btrfs_extend_item(path, size);
+ btrfs_extend_item(trans, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1033,7 +1048,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
} else {
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1066,7 +1081,9 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
/*
* helper to update/remove inline back ref
*/
-static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path,
+static noinline_for_stack int update_inline_extent_backref(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
struct btrfs_delayed_extent_op *extent_op)
@@ -1174,9 +1191,9 @@ static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *pa
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- btrfs_truncate_item(path, item_size, 1);
+ btrfs_truncate_item(trans, path, item_size, 1);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
return 0;
}
@@ -1206,9 +1223,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
bytenr, num_bytes, root_objectid, path->slots[0]);
return -EUCLEAN;
}
- ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op);
+ ret = update_inline_extent_backref(trans, path, iref,
+ refs_to_add, extent_op);
} else if (ret == -ENOENT) {
- setup_inline_extent_backref(trans->fs_info, path, iref, parent,
+ setup_inline_extent_backref(trans, path, iref, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
ret = 0;
@@ -1226,7 +1244,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
BUG_ON(!is_data && refs_to_drop != 1);
if (iref)
- ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ ret = update_inline_extent_backref(trans, path, iref,
+ -refs_to_drop, NULL);
else if (is_data)
ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
else
@@ -1422,7 +1441,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -1435,7 +1454,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
}
/*
- * __btrfs_inc_extent_ref - insert backreference for a given extent
+ * Insert backreference for a given extent.
*
* The counterpart is in __btrfs_free_extent(), with examples and more details
* how it works.
@@ -1465,8 +1484,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* always passed as 0. For data extents it is the fileoffset
* this extent belongs to.
*
- * @refs_to_add Number of references to add
- *
* @extent_op Pointer to a structure, holding information necessary when
* updating a tree block's flags
*
@@ -1474,7 +1491,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add,
+ u64 owner, u64 offset,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_path *path;
@@ -1484,6 +1501,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
u64 refs;
+ int refs_to_add = node->ref_mod;
int ret;
path = btrfs_alloc_path();
@@ -1510,7 +1528,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/* now insert the actual backref */
@@ -1529,45 +1547,72 @@ out:
return ret;
}
+static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *href)
+{
+ u64 root = href->owning_root;
+
+ /*
+ * Don't check must_insert_reserved, as this is called from contexts
+ * where it has already been unset.
+ */
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
+ !href->is_data || !is_fstree(root))
+ return;
+
+ btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
+ BTRFS_QGROUP_RSV_DATA);
+}
+
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
int ret = 0;
struct btrfs_delayed_data_ref *ref;
- struct btrfs_key ins;
u64 parent = 0;
- u64 ref_root = 0;
u64 flags = 0;
- ins.objectid = node->bytenr;
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
ref = btrfs_delayed_node_to_data_ref(node);
trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
- ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ struct btrfs_key key;
+ struct btrfs_squota_delta delta = {
+ .root = href->owning_root,
+ .num_bytes = node->num_bytes,
+ .is_data = true,
+ .is_inc = true,
+ .generation = trans->transid,
+ };
+
if (extent_op)
flags |= extent_op->flags_to_set;
- ret = alloc_reserved_file_extent(trans, parent, ref_root,
+
+ key.objectid = node->bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = node->num_bytes;
+
+ ret = alloc_reserved_file_extent(trans, parent, ref->root,
flags, ref->objectid,
- ref->offset, &ins,
- node->ref_mod);
+ ref->offset, &key,
+ node->ref_mod, href->owning_root);
+ free_head_ref_squota_rsv(trans->fs_info, href);
+ if (!ret)
+ ret = btrfs_record_squota_delta(trans->fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
+ ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root,
ref->objectid, ref->offset,
- node->ref_mod, extent_op);
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, node, parent,
- ref_root, ref->objectid,
- ref->offset, node->ref_mod,
- extent_op);
+ ret = __btrfs_free_extent(trans, href, node, parent,
+ ref->root, ref->objectid,
+ ref->offset, extent_op);
} else {
BUG();
}
@@ -1604,7 +1649,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
u32 item_size;
int ret;
- int err = 0;
int metadata = 1;
if (TRANS_ABORTED(trans))
@@ -1631,10 +1675,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- err = ret;
goto out;
- }
- if (ret > 0) {
+ } else if (ret > 0) {
if (metadata) {
if (path->slots[0] > 0) {
path->slots[0]--;
@@ -1655,7 +1697,7 @@ again:
goto again;
}
} else {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
head->bytenr, head->num_bytes, extent_op->level);
@@ -1667,29 +1709,31 @@ again:
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- btrfs_abort_transaction(trans, err);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
__run_delayed_extent_op(extent_op, leaf, ei);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
int ret = 0;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_tree_ref *ref;
u64 parent = 0;
u64 ref_root = 0;
@@ -1709,14 +1753,24 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
return -EUCLEAN;
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ struct btrfs_squota_delta delta = {
+ .root = href->owning_root,
+ .num_bytes = fs_info->nodesize,
+ .is_data = false,
+ .is_inc = true,
+ .generation = trans->transid,
+ };
+
BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, node, extent_op);
+ if (!ret)
+ btrfs_record_squota_delta(fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, node, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ret = __btrfs_free_extent(trans, href, node, parent, ref_root,
+ ref->level, 0, extent_op);
} else {
BUG();
}
@@ -1725,6 +1779,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
@@ -1732,19 +1787,23 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
int ret = 0;
if (TRANS_ABORTED(trans)) {
- if (insert_reserved)
+ if (insert_reserved) {
btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ free_head_ref_squota_rsv(trans->fs_info, href);
+ }
return 0;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
- ret = run_delayed_tree_ref(trans, node, extent_op,
+ ret = run_delayed_tree_ref(trans, href, node, extent_op,
insert_reserved);
else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
- ret = run_delayed_data_ref(trans, node, extent_op,
+ ret = run_delayed_data_ref(trans, href, node, extent_op,
insert_reserved);
+ else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
+ ret = 0;
else
BUG();
if (ret && insert_reserved)
@@ -1823,28 +1882,38 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
return ret ? ret : 1;
}
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- int nr_items = 1; /* Dropping this ref head update. */
+ u64 ret = 0;
/*
* We had csum deletions accounted for in our delayed refs rsv, we need
* to drop the csum leaves for this update from our delayed_refs_rsv.
*/
if (head->total_ref_mod < 0 && head->is_data) {
+ int nr_csums;
+
spin_lock(&delayed_refs->lock);
delayed_refs->pending_csums -= head->num_bytes;
spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+
+ btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
+
+ ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
}
+ /* must_insert_reserved can be set only if we didn't run the head ref. */
+ if (head->must_insert_reserved)
+ free_head_ref_squota_rsv(fs_info, head);
- btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+ return ret;
}
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head)
+ struct btrfs_delayed_ref_head *head,
+ u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1889,7 +1958,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
@@ -1931,7 +2000,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
}
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *locked_ref)
+ struct btrfs_delayed_ref_head *locked_ref,
+ u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1979,14 +2049,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
+ /*
+ * Unsetting this on the head ref relinquishes ownership of
+ * the reserved bytes, so it is critical that every possible code
+ * path from here forward frees all reserves, including the qgroup
+ * reserve.
+ */
locked_ref->must_insert_reserved = false;
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
spin_unlock(&locked_ref->lock);
- ret = run_one_delayed_ref(trans, ref, extent_op,
+ ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
must_insert_reserved);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
+ *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
@@ -2010,15 +2088,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long nr)
+ u64 min_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *locked_ref = NULL;
int ret;
unsigned long count = 0;
+ unsigned long max_count = 0;
+ u64 bytes_processed = 0;
delayed_refs = &trans->transaction->delayed_refs;
+ if (min_bytes == 0) {
+ max_count = delayed_refs->num_heads_ready;
+ min_bytes = U64_MAX;
+ }
+
do {
if (!locked_ref) {
locked_ref = btrfs_obtain_ref_head(trans);
@@ -2046,7 +2131,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
- ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
@@ -2058,7 +2143,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* Success, perform the usual cleanup of a processed
* head
*/
- ret = cleanup_ref_head(trans, locked_ref);
+ ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
if (ret > 0) {
/* We dropped our lock, we need to loop. */
ret = 0;
@@ -2075,7 +2160,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
locked_ref = NULL;
cond_resched();
- } while ((nr != -1 && count < nr) || locked_ref);
+ } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
+ (max_count > 0 && count < max_count) ||
+ locked_ref);
return 0;
}
@@ -2124,24 +2211,25 @@ static u64 find_middle(struct rb_root *root)
#endif
/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far. count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
+ * Start processing the delayed reference count updates and extent insertions
+ * we have queued up so far.
+ *
+ * @trans: Transaction handle.
+ * @min_bytes: How many bytes of delayed references to process. After this
+ * many bytes we stop processing delayed references if there are
+ * any more. If 0, run all delayed references that already exist,
+ * but not new ones added while running them.
+ * Use (u64)-1 (U64_MAX) to run all existing delayed references
+ * plus any new ones that are added.
*
* Returns 0 on success or if called with an aborted transaction
* Returns <0 on error and aborts the transaction
*/
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long count)
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_head *head;
int ret;
- int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
if (TRANS_ABORTED(trans))
@@ -2151,42 +2239,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
return 0;
delayed_refs = &trans->transaction->delayed_refs;
- if (count == 0)
- count = delayed_refs->num_heads_ready;
-
again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- ret = __btrfs_run_delayed_refs(trans, count);
+ ret = __btrfs_run_delayed_refs(trans, min_bytes);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
}
- if (run_all) {
+ if (min_bytes == U64_MAX) {
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- node = rb_first_cached(&delayed_refs->href_root);
- if (!node) {
+ if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
spin_unlock(&delayed_refs->lock);
- goto out;
+ return 0;
}
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
- /* Mutex was contended, block until it's released and retry. */
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
-
- btrfs_put_delayed_ref_head(head);
cond_resched();
goto again;
}
-out:
+
return 0;
}
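A hedged sketch of how callers can size a run under the byte-based interface; flush_some_delayed_refs() is a hypothetical wrapper, while btrfs_run_delayed_refs() and btrfs_calc_delayed_ref_bytes() are used as in the hunks above:

/* Hypothetical wrapper: flush roughly nr_refs worth of delayed refs. */
static int flush_some_delayed_refs(struct btrfs_trans_handle *trans,
				   int nr_refs)
{
	u64 min_bytes = btrfs_calc_delayed_ref_bytes(trans->fs_info, nr_refs);

	return btrfs_run_delayed_refs(trans, min_bytes);
}

/*
 * Other choices for min_bytes:
 *   btrfs_run_delayed_refs(trans, 0);        run all refs queued so far
 *   btrfs_run_delayed_refs(trans, U64_MAX);  also run refs added meanwhile
 */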
@@ -2311,6 +2387,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_extent_item *ei;
struct btrfs_key key;
u32 item_size;
+ u32 expected_size;
int type;
int ret;
@@ -2337,10 +2414,22 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ret = 1;
item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
+
+ /* No inline refs; we need to bail before checking for owner ref. */
+ if (item_size == sizeof(*ei))
+ goto out;
+
+ /* Check for an owner ref; skip over it to the real inline refs. */
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+ if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+ iref = (struct btrfs_extent_inline_ref *)(iref + 1);
+ }
/* If extent item has more than 1 inline ref then it's shared */
- if (item_size != sizeof(*ei) +
- btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+ if (item_size != expected_size)
goto out;
/*
@@ -2352,8 +2441,6 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_root_last_snapshot(&root->root_item)))
goto out;
- iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-
/* If this extent has SHARED_DATA_REF then it's shared */
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
@@ -2450,7 +2537,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent);
+ num_bytes, parent, ref_root);
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
key.offset, root->root_key.objectid,
for_reloc);
@@ -2463,8 +2550,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = fs_info->nodesize;
+ /* We don't know the owning_root, use 0. */
btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent);
+ num_bytes, parent, 0);
btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
root->root_key.objectid, for_reloc);
if (inc)
@@ -2565,16 +2653,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * this function must be called within transaction
- */
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes)
+ const struct extent_buffer *eb)
{
struct btrfs_block_group *cache;
int ret;
- cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
if (!cache)
return -EINVAL;
@@ -2586,10 +2671,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- pin_down_extent(trans, cache, bytenr, num_bytes, 0);
+ pin_down_extent(trans, cache, eb->start, eb->len, 0);
/* remove us from the free space cache (if we're there at all) */
- ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+ ret = btrfs_remove_free_space(cache, eb->start, eb->len);
out:
btrfs_put_block_group(cache);
return ret;
@@ -2844,12 +2929,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Parse an extent item's inline extents looking for a simple quotas owner ref.
+ *
+ * @fs_info: the btrfs_fs_info for this mount
+ * @leaf: a leaf in the extent tree containing the extent item
+ * @slot: the slot in the leaf where the extent item is found
+ *
+ * Returns the objectid of the root that originally allocated the extent item
+ * if the inline owner ref is expected and present, otherwise 0.
+ *
+ * If an extent item has an owner ref item, it will be the first inline ref
+ * item. Therefore the logic is to check whether there are any inline ref
+ * items, then check the type of the first one.
+ */
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_owner_ref *oref;
+ unsigned long ptr;
+ unsigned long end;
+ int type;
+
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
+ return 0;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + btrfs_item_size(leaf, slot);
+
+ /* No inline ref items of any kind, can't check type. */
+ if (ptr == end)
+ return 0;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
+
+ /* We found an owner ref, get the root out of it. */
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ return btrfs_extent_owner_ref_root_id(leaf, oref);
+ }
+
+ /* We have inline refs, but not an owner ref. */
+ return 0;
+}
+
static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, bool is_data)
+ u64 bytenr, struct btrfs_squota_delta *delta)
{
int ret;
+ u64 num_bytes = delta->num_bytes;
- if (is_data) {
+ if (delta->is_data) {
struct btrfs_root *csum_root;
csum_root = btrfs_csum_root(trans->fs_info, bytenr);
@@ -2858,6 +2992,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
+
+ ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = btrfs_record_squota_delta(trans->fs_info, delta);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
ret = add_to_free_space_tree(trans, bytenr, num_bytes);
@@ -2940,9 +3086,10 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
* And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
- u64 owner_offset, int refs_to_drop,
+ u64 owner_offset,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -2957,11 +3104,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
+ int refs_to_drop = node->ref_mod;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+ u64 delayed_ref_root = href->owning_root;
extent_root = btrfs_extent_root(info, bytenr);
ASSERT(extent_root);
@@ -3151,7 +3300,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
} else {
btrfs_set_extent_refs(leaf, ei, refs);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
@@ -3162,6 +3311,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ struct btrfs_squota_delta delta = {
+ .root = delayed_ref_root,
+ .num_bytes = num_bytes,
+ .is_data = is_data,
+ .is_inc = false,
+ .generation = btrfs_extent_generation(leaf, ei),
+ };
+
/* In this branch refs == 1 */
if (found_extent) {
if (is_data && refs_to_drop !=
@@ -3200,6 +3357,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
num_to_del = 2;
}
}
+ /*
+ * We can't infer the data owner from the delayed ref, so we need
+ * to try to get it from the owning ref item.
+ *
+ * If it is not present, then that extent was not written under
+ * simple quotas mode, so we don't need to account for its deletion.
+ */
+ if (is_data)
+ delta.root = btrfs_get_extent_owner_root(trans->fs_info,
+ leaf, extent_slot);
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
@@ -3209,7 +3376,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
+ ret = do_free_extent_accounting(trans, bytenr, &delta);
}
btrfs_release_path(path);
@@ -3283,7 +3450,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
- buf->start, buf->len, parent);
+ buf->start, buf->len, parent, btrfs_header_owner(buf));
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
root_id, 0, false);
@@ -3370,10 +3537,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
- /* unlocks the pinned mutex */
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
@@ -3383,9 +3549,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -4442,8 +4608,8 @@ loop:
}
/*
- * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
- * hole that is at least as big as @num_bytes.
+ * Entry point to the extent allocator. Tries to find a hole that is at least
+ * as big as @num_bytes.
*
* @root - The root that will contain this extent
*
@@ -4562,20 +4728,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
- u64 len)
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+ const struct extent_buffer *eb)
{
struct btrfs_block_group *cache;
int ret = 0;
- cache = btrfs_lookup_block_group(trans->fs_info, start);
+ cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
if (!cache) {
btrfs_err(trans->fs_info, "unable to find block group for %llu",
- start);
+ eb->start);
return -ENOSPC;
}
- ret = pin_down_extent(trans, cache, start, len, 1);
+ ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
btrfs_put_block_group(cache);
return ret;
}
@@ -4605,24 +4771,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
- struct btrfs_key *ins, int ref_mod)
+ struct btrfs_key *ins, int ref_mod, u64 oref_root)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
+ struct btrfs_extent_owner_ref *oref;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
int type;
u32 size;
+ const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
if (parent > 0)
type = BTRFS_SHARED_DATA_REF_KEY;
else
type = BTRFS_EXTENT_DATA_REF_KEY;
- size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+ size = sizeof(*extent_item);
+ if (simple_quota)
+ size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+ size += btrfs_extent_inline_ref_size(type);
path = btrfs_alloc_path();
if (!path)
@@ -4644,7 +4815,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
flags | BTRFS_EXTENT_FLAG_DATA);
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ if (simple_quota) {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
+ iref = (struct btrfs_extent_inline_ref *)(oref + 1);
+ }
btrfs_set_extent_inline_ref_type(leaf, iref, type);
+
if (parent > 0) {
struct btrfs_shared_data_ref *ref;
ref = (struct btrfs_shared_data_ref *)(iref + 1);
@@ -4659,7 +4837,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
}
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_free_path(path);
return alloc_reserved_extent(trans, ins->objectid, ins->offset);
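For orientation, a sketch of the inline ref layout this writes when simple quotas are enabled (an illustrative summary, not a format definition):

/*
 * struct btrfs_extent_item                  refs / generation / flags
 * inline ref 0: BTRFS_EXTENT_OWNER_REF_KEY
 *   struct btrfs_extent_owner_ref           root_id = oref_root
 * inline ref 1: BTRFS_EXTENT_DATA_REF_KEY   (or BTRFS_SHARED_DATA_REF_KEY
 *   struct btrfs_extent_data_ref             when parent > 0)
 *
 * The owner ref is always the first inline ref, which is what
 * btrfs_get_extent_owner_root() and the updated check_committed_ref()
 * rely on when parsing the item.
 */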
@@ -4734,7 +4912,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
@@ -4746,12 +4924,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 };
+ u64 root_objectid = root->root_key.objectid;
+ u64 owning_root = root_objectid;
+
+ BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
- BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+ owning_root = root->relocation_src_root;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ ins->objectid, ins->offset, 0, owning_root);
+ btrfs_init_data_ref(&generic_ref, root_objectid, owner,
offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
@@ -4771,6 +4954,13 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
+ struct btrfs_squota_delta delta = {
+ .root = root_objectid,
+ .num_bytes = ins->offset,
+ .generation = trans->transid,
+ .is_data = true,
+ .is_inc = true,
+ };
/*
* Mixed block groups will exclude before processing the log so we only
@@ -4796,13 +4986,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
spin_unlock(&space_info->lock);
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
- offset, ins, 1);
+ offset, ins, 1, root_objectid);
if (ret)
btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
+ ret = btrfs_record_squota_delta(fs_info, &delta);
btrfs_put_block_group(block_group);
return ret;
}
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extra safety check in case the extent tree is corrupted and the extent
+ * allocator chooses to use a tree block which is already used and locked.
+ */
+static bool check_eb_lock_owner(const struct extent_buffer *eb)
+{
+ if (eb->lock_owner == current->pid) {
+ btrfs_err_rl(eb->fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ eb->start, btrfs_header_owner(eb), current->pid);
+ return true;
+ }
+ return false;
+}
+#else
+static bool check_eb_lock_owner(struct extent_buffer *eb)
+{
+ return false;
+}
+#endif
+
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, int level, u64 owner,
@@ -4816,15 +5029,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
- /*
- * Extra safety check in case the extent tree is corrupted and extent
- * allocator chooses to use a tree block which is already used and
- * locked.
- */
- if (buf->lock_owner == current->pid) {
- btrfs_err_rl(fs_info,
-"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
- buf->start, btrfs_header_owner(buf), current->pid);
+ if (check_eb_lock_owner(buf)) {
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -4901,6 +5106,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
+ u64 reloc_src_root,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4913,6 +5119,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
int ret;
u32 blocksize = fs_info->nodesize;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+ u64 owning_root;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
@@ -4939,11 +5146,13 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
ret = PTR_ERR(buf);
goto out_free_reserved;
}
+ owning_root = btrfs_header_owner(buf);
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
parent = ins.objectid;
flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ owning_root = reloc_src_root;
} else
BUG_ON(parent > 0);
@@ -4963,7 +5172,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins.objectid, ins.offset, parent);
+ ins.objectid, ins.offset, parent, owning_root);
btrfs_init_tree_ref(&generic_ref, level, root_objectid,
root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
@@ -5051,7 +5260,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
wc->level - 1, 1, &refs,
- &flags);
+ &flags, NULL);
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
@@ -5118,7 +5327,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
@@ -5208,6 +5418,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
+ u64 owner_root = 0;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
struct btrfs_ref ref = { 0 };
@@ -5251,7 +5462,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
- &wc->flags[level - 1]);
+ &wc->flags[level - 1],
+ &owner_root);
if (ret < 0)
goto out_unlock;
@@ -5384,7 +5596,7 @@ skip:
find_next_key(path, level, &wc->drop_progress);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- fs_info->nodesize, parent);
+ fs_info->nodesize, parent, owner_root);
btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
0, false);
ret = btrfs_free_extent(trans, &ref);
@@ -5451,7 +5663,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5696,7 +5909,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level], NULL);
if (ret < 0) {
err = ret;
goto out_end_trans;
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 88c249c37516..2e066035ccee 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -7,6 +7,7 @@
#include "block-group.h"
struct btrfs_free_cluster;
+struct btrfs_delayed_ref_head;
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
@@ -91,18 +92,19 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes);
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags);
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owner_root);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes);
+ const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict,
@@ -113,6 +115,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
+ u64 reloc_src_root,
enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 root_id,
@@ -136,12 +139,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+ const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
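For context, a minimal sketch of calling btrfs_lookup_extent_info() with its new owner_root out-parameter: callers that do not need the owning root pass NULL, exactly as the walk_up_proc() and btrfs_drop_snapshot() hunks above do, while walk_down_tree() captures it to seed the delayed-ref owner. The function name below is hypothetical.

/* Hypothetical caller, shown only to illustrate the extended prototype. */
static int example_lookup_refs(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       u64 bytenr, int level)
{
	u64 refs;
	u64 flags;

	/* Metadata lookup: the tree level is passed in the offset slot. */
	return btrfs_lookup_extent_info(trans, fs_info, bytenr, level, 1,
					&refs, &flags,
					NULL /* owner_root not needed here */);
}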
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index caccd0376342..8f724c54fc8e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -21,7 +21,6 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "bio.h"
-#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
@@ -395,7 +394,7 @@ again:
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, 1, cached_state);
+ EXTENT_DELALLOC, cached_state);
if (!ret) {
unlock_extent(tree, delalloc_start, delalloc_end,
&cached_state);
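As the conversions in this file suggest, the old integer 'filled' argument of test_range_bit() is split into two helpers: test_range_bit() now always asks whether the whole range has the bit set (with an optional cached state), while the new test_range_bit_exists(), used in later hunks, asks whether the bit is set anywhere in the range. A small sketch with a hypothetical wrapper:

static bool example_range_checks(struct extent_io_tree *tree, u64 start, u64 end,
				 struct extent_state *cached)
{
	/* True only if every byte in [start, end] has EXTENT_DELALLOC set. */
	if (!test_range_bit(tree, start, end, EXTENT_DELALLOC, cached))
		return false;

	/* True if no part of [start, end] is still locked. */
	return !test_range_bit_exists(tree, start, end, EXTENT_LOCKED);
}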
@@ -675,8 +674,8 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
* the array will be skipped
*
* Return: 0 if all pages were able to be allocated;
- * -ENOMEM otherwise, and the caller is responsible for freeing all
- * non-null page pointers in the array.
+ * -ENOMEM otherwise; any partially allocated pages are freed and the
+ * array slots are reset to NULL
*/
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
{
@@ -695,8 +694,13 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
* though alloc_pages_bulk_array() falls back to alloc_page()
* if it could not bulk-allocate. So we must be out of memory.
*/
- if (allocated == last)
+ if (allocated == last) {
+ for (int i = 0; i < allocated; i++) {
+ __free_page(page_array[i]);
+ page_array[i] = NULL;
+ }
return -ENOMEM;
+ }
memalloc_retry_wait(GFP_NOFS);
}
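A hypothetical caller, illustrating the new contract documented above: on -ENOMEM the helper has already freed whatever it managed to allocate and reset the slots to NULL, so the error path needs no cleanup of its own.

static int example_alloc_pages(unsigned int nr_pages, struct page **pages)
{
	int ret = btrfs_alloc_page_array(nr_pages, pages);

	if (ret)
		return ret;	/* nothing to undo, all array slots are NULL again */

	/* ... use the pages, then release them with the usual helpers ... */
	return 0;
}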
@@ -2294,11 +2298,12 @@ static int try_release_extent_state(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
int ret = 1;
- if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
+ if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
ret = 0;
} else {
u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
- EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+ EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
+ EXTENT_QGROUP_RESERVED);
/*
* At this point we can safely clear everything except the
@@ -2353,9 +2358,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
free_extent_map(em);
break;
}
- if (test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL))
+ if (test_range_bit_exists(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED))
goto next;
/*
* If it's not in the list of modified extents, used
@@ -3455,6 +3460,12 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
start, fs_info->nodesize);
return -EINVAL;
}
+ if (!IS_ALIGNED(start, fs_info->nodesize) &&
+ !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+ btrfs_warn(fs_info,
+"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
+ start, fs_info->nodesize);
+ }
return 0;
}
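A quick worked example of what the new warning catches (the numbers are illustrative, not taken from the patch): with sectorsize 4096 and nodesize 16384, a tree block starting at byte 20480 is sector aligned but not node aligned, so it passes the existing check yet trips the new one; test_and_set_bit() on BTRFS_FS_UNALIGNED_TREE_BLOCK keeps the message to a single print per mount.

static bool example_is_unaligned_tree_block(u64 start)
{
	/* Illustrative values: sectorsize 4096, nodesize 16384, start 20480. */
	return IS_ALIGNED(start, 4096) && !IS_ALIGNED(start, 16384);
}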
@@ -4248,14 +4259,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
}
/*
- * eb_bitmap_offset() - calculate the page and offset of the byte containing the
- * given bit number
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains the
- * given bit number
- * @page_offset: return offset into the page given by page_index
+ * Calculate the page and offset of the byte containing the given bit number.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @page_index: return index of the page in the extent buffer that contains
+ * the given bit number
+ * @page_offset: return offset into the page given by page_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
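A sketch of the arithmetic the (now plainly worded) comment describes, under the assumption that the extent buffer may start partway into its first page; the function name and exact variable names are illustrative, mirroring the documented parameters rather than the implementation.

static void example_eb_bitmap_offset(const struct extent_buffer *eb,
				     unsigned long start, unsigned long nr,
				     unsigned long *page_index, size_t *page_offset)
{
	/* Byte holding bit 'nr' inside the bitmap item. */
	size_t byte_offset = nr / BITS_PER_BYTE;
	/* Offset of that byte from the first page backing the extent buffer. */
	size_t offset = start + offset_in_page(eb->start) + byte_offset;

	*page_index = offset >> PAGE_SHIFT;	/* which page of the eb */
	*page_offset = offset_in_page(offset);	/* where in that page   */
}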
@@ -4614,7 +4625,8 @@ int try_release_extent_buffer(struct page *page)
}
/*
- * btrfs_readahead_tree_block - attempt to readahead a child block
+ * Attempt to readahead a child block.
+ *
* @fs_info: the fs_info
* @bytenr: bytenr to read
* @owner_root: objectid of the root that owns this eb
@@ -4653,7 +4665,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_readahead_node_child - readahead a node's child block
+ * Readahead a node's child block.
+ *
* @node: parent node we're reading from
* @slot: slot in the parent node for the child we want to read
*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 68368ba99321..2171057a4477 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -80,16 +80,16 @@ struct extent_buffer {
spinlock_t refs_lock;
atomic_t refs;
int read_mirror;
- struct rcu_head rcu_head;
- pid_t lock_owner;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
+ struct rcu_head rcu_head;
struct rw_semaphore lock;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
+ pid_t lock_owner;
#endif
};
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ce5dd154499..45cae356e89b 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -194,7 +194,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_encryption(leaf, item, 0);
btrfs_set_file_extent_other_encoding(leaf, item, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -811,11 +811,12 @@ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
* This calls btrfs_truncate_item with the correct args based on the overlap,
* and fixes up the key as required.
*/
-static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
+static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_key *key,
u64 bytenr, u64 len)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_buffer *leaf;
const u32 csum_size = fs_info->csum_size;
u64 csum_end;
@@ -836,7 +837,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(path, new_size, 1);
+ btrfs_truncate_item(trans, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -848,10 +849,10 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(path, new_size, 0);
+ btrfs_truncate_item(trans, path, new_size, 0);
key->offset = end_byte;
- btrfs_set_item_key_safe(fs_info, path, key);
+ btrfs_set_item_key_safe(trans, path, key);
} else {
BUG();
}
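A worked example for the first branch above, with illustrative values (4 KiB sectors, so blocksize_bits == 12, and 4-byte crc32c checksums):

/*
 * A csum item keyed at offset X covering 64 KiB holds 16 checksums.
 * Truncating from bytenr = X + 16 KiB keeps the first four sectors:
 *
 *	new_size = ((bytenr - key->offset) >> 12) * 4
 *	         = (16384 >> 12) * 4 = 16 bytes
 *
 * and btrfs_truncate_item() trims the item from its tail.  The second
 * branch does the mirror-image calculation from csum_end and then moves
 * the key forward with btrfs_set_item_key_safe().
 */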
@@ -994,7 +995,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
} else {
- truncate_one_csum(fs_info, path, &key, bytenr, len);
+ truncate_one_csum(trans, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
@@ -1202,7 +1203,7 @@ extend_csum:
diff /= csum_size;
diff *= csum_size;
- btrfs_extend_item(path, diff);
+ btrfs_extend_item(trans, path, diff);
ret = 0;
goto csum;
}
@@ -1249,7 +1250,7 @@ found:
ins_size /= csum_size;
total_bytes += ins_size * fs_info->sectorsize;
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
if (total_bytes < sums->len) {
btrfs_release_path(path);
cond_resched();
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 361535c71c0f..32611a4edd6b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -17,6 +17,7 @@
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
+#include <linux/iomap.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -368,12 +369,13 @@ next_slot:
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->start);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
- disk_bytenr, num_bytes, 0);
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
@@ -405,13 +407,13 @@ next_slot:
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = args->end;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
extent_offset += args->end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->end);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += args->end - key.offset;
break;
@@ -431,7 +433,7 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
args->start - key.offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += extent_end - args->start;
if (args->end == extent_end)
@@ -463,7 +465,8 @@ delete_extent_item:
} else if (update_refs && disk_bytenr > 0) {
btrfs_init_generic_ref(&ref,
BTRFS_DROP_DELAYED_REF,
- disk_bytenr, num_bytes, 0);
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
@@ -536,7 +539,8 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
+ btrfs_setup_item_for_insert(trans, root, path, &key,
+ args->extent_item_size);
args->extent_inserted = true;
}
@@ -593,7 +597,6 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_path *path;
@@ -664,7 +667,7 @@ again:
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
@@ -679,7 +682,7 @@ again:
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -698,7 +701,7 @@ again:
trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -708,7 +711,7 @@ again:
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
start - orig_offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -742,10 +745,10 @@ again:
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
- num_bytes, 0);
+ num_bytes, 0, root->root_key.objectid);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -771,7 +774,7 @@ again:
other_start = end;
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, 0);
+ num_bytes, 0, root->root_key.objectid);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
@@ -814,7 +817,7 @@ again:
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
@@ -823,7 +826,7 @@ again:
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret < 0) {
@@ -1108,17 +1111,18 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
static void update_time_for_write(struct inode *inode)
{
- struct timespec64 now, ctime;
+ struct timespec64 now, ts;
if (IS_NOCMTIME(inode))
return;
now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
- inode->i_mtime = now;
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(&ts, &now))
+ inode_set_mtime_to_ts(inode, now);
- ctime = inode_get_ctime(inode);
- if (!timespec64_equal(&ctime, &now))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(&ts, &now))
inode_set_ctime_to_ts(inode, now);
if (IS_I_VERSION(inode))
@@ -1746,7 +1750,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
struct btrfs_inode *inode = BTRFS_I(ctx->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
list_empty(&ctx->ordered_extents))
return true;
@@ -1757,7 +1761,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
- if (inode->last_trans <= fs_info->last_trans_committed &&
+ if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
list_empty(&ctx->ordered_extents)))
return true;
@@ -1886,7 +1890,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
- smp_mb();
if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
@@ -2104,7 +2107,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
@@ -2112,7 +2115,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
u64 num_bytes;
key.offset = offset;
- btrfs_set_item_key_safe(fs_info, path, &key);
+ btrfs_set_item_key_safe(trans, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2121,7 +2124,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
btrfs_release_path(path);
@@ -2273,7 +2276,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
if (extent_info->is_new_extent)
btrfs_set_file_extent_generation(leaf, extent, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
@@ -2303,7 +2306,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
extent_info->disk_offset,
- extent_info->disk_len, 0);
+ extent_info->disk_len, 0,
+ root->root_key.objectid);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
btrfs_ino(inode), ref_offset, 0, false);
@@ -2473,9 +2477,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
inode_inc_iversion(&inode->vfs_inode);
if (!extent_info || extent_info->update_times)
- inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
+ inode_set_mtime_to_ts(&inode->vfs_inode,
+ inode_set_ctime_current(&inode->vfs_inode));
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
break;
@@ -2714,8 +2719,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -2734,14 +2739,14 @@ out_only_mutex:
struct timespec64 now = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
- inode->i_mtime = now;
+ inode_set_mtime_to_ts(inode, now);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
int ret2;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
if (!ret)
ret = ret2;
@@ -2808,7 +2813,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode_set_ctime_current(inode);
i_size_write(inode, end);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
return ret ? ret : ret2;
@@ -3187,7 +3192,7 @@ static long btrfs_fallocate(struct file *file, int mode,
qgroup_reserved -= range->len;
} else if (qgroup_reserved > 0) {
btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
- range->start, range->len);
+ range->start, range->len, NULL);
qgroup_reserved -= range->len;
}
list_del(&range->list);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 27fad70451aa..6f93c9a2c3e3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -57,6 +57,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, bool update_stats);
+static void btrfs_crc32c_final(u32 crc, u8 *result)
+{
+ put_unaligned_le32(~crc, result);
+}
+
static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
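A short sketch of what the new local helper produces, assuming the crc32c conventions already used by this file: the running CRC is bit-inverted and stored little-endian, which is the form io_ctl_check_crc() later compares against. The function and seed below are illustrative.

static void example_cache_csum(const void *buf, size_t len, u8 stored[4])
{
	/* The seed is illustrative; the io_ctl code keeps its own running CRC. */
	u32 crc = crc32c(~0U, buf, len);

	btrfs_crc32c_final(crc, stored);  /* stores ~crc as an unaligned LE u32 */
}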
@@ -195,7 +200,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -213,7 +218,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header);
memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -354,7 +359,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
if (ret)
goto fail;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
fail:
if (locked)
@@ -540,7 +545,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
- crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
@@ -562,7 +567,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
val = *tmp;
io_ctl_map_page(io_ctl, 0);
- crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->fs_info,
@@ -1185,7 +1190,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_set_free_space_entries(leaf, header, entries);
btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
btrfs_set_free_space_generation(leaf, header, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -1321,7 +1326,7 @@ out:
"failed to write free space cache for block group %llu error %d",
block_group->start, ret);
}
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode(trans, BTRFS_I(inode));
if (block_group) {
/* the dirty list is protected by the dirty_bgs_lock */
@@ -1362,7 +1367,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
/*
* Write out cached info to an inode.
*
- * @root: root the inode belongs to
* @inode: freespace inode we are writing out
* @ctl: free space cache we are going to write out
* @block_group: block_group for this cache if it belongs to a block_group
@@ -1373,7 +1377,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
* on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
-static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+static int __btrfs_write_out_cache(struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
@@ -1506,7 +1510,7 @@ out:
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode(trans, BTRFS_I(inode));
if (must_iput)
iput(inode);
return ret;
@@ -1532,8 +1536,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return 0;
- ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
- block_group, &block_group->io_ctl, trans);
+ ret = __btrfs_write_out_cache(inode, ctl, block_group,
+ &block_group->io_ctl, trans);
if (ret) {
btrfs_debug(fs_info,
"failed to write free space cache for block group %llu error %d",
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index c0e734082dcc..7b598b070700 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -89,7 +89,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -287,7 +287,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
if (extent_count != expected_extent_count) {
@@ -324,7 +324,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
i += extent_size;
@@ -430,7 +430,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
@@ -495,7 +495,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
extent_count += new_extents;
btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
@@ -533,7 +533,8 @@ int free_space_test_bit(struct btrfs_block_group *block_group,
return !!extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_block_group *block_group,
+static void free_space_set_bits(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 *start, u64 *size,
int bit)
{
@@ -563,7 +564,7 @@ static void free_space_set_bits(struct btrfs_block_group *block_group,
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
*size -= end - *start;
*start = end;
@@ -656,7 +657,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
cur_start = start;
cur_size = size;
while (1) {
- free_space_set_bits(block_group, path, &cur_start, &cur_size,
+ free_space_set_bits(trans, block_group, path, &cur_start, &cur_size,
!remove);
if (cur_size == 0)
break;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index a523d64d5491..318df6f9d9cb 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -139,6 +139,12 @@ enum {
*/
BTRFS_FS_FEATURE_CHANGED,
+ /*
+ * Indicate that we have found a tree block which is only aligned to
+ * sectorsize, but not to nodesize. This should be rare nowadays.
+ */
+ BTRFS_FS_UNALIGNED_TREE_BLOCK,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -171,19 +177,17 @@ enum {
BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
- BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
- BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
- BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
- BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
- BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
- BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
- BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
- BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
- BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
- BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
- BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
- BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
- BTRFS_MOUNT_NODISCARD = (1UL << 31),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 19),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 20),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 21),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 22),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 23),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 24),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 25),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 26),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
+ BTRFS_MOUNT_NODISCARD = (1UL << 29),
};
/*
@@ -216,7 +220,8 @@ enum {
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED)
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -225,6 +230,7 @@ enum {
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
#else
@@ -369,6 +375,7 @@ struct btrfs_fs_info {
struct btrfs_root *uuid_root;
struct btrfs_root *data_reloc_root;
struct btrfs_root *block_group_root;
+ struct btrfs_root *stripe_root;
/* The log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -409,7 +416,17 @@ struct btrfs_fs_info {
struct btrfs_block_rsv empty_block_rsv;
+ /*
+ * Updated while holding the lock 'trans_lock'. Due to the life cycle of
+ * a transaction, it can be read directly while holding a transaction
+ * handle; everywhere else it must be read with btrfs_get_fs_generation().
+ * It should always be updated using btrfs_set_fs_generation().
+ */
u64 generation;
+ /*
+ * Always use btrfs_get_last_trans_committed() and
+ * btrfs_set_last_trans_committed() to read and update this field.
+ */
u64 last_trans_committed;
/*
* Generation of the last transaction used for block group relocation
@@ -645,9 +662,6 @@ struct btrfs_fs_info {
struct btrfs_discard_ctl discard_ctl;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- u32 check_integrity_print_mask;
-#endif
/* Is qgroup tracking in a consistent state? */
u64 qgroup_flags;
@@ -683,6 +697,7 @@ struct btrfs_fs_info {
/* Protected by qgroup_rescan_lock */
bool qgroup_rescan_running;
u8 qgroup_drop_subtree_thres;
+ u64 qgroup_enable_gen;
/*
* If this is not 0, then it indicates a serious filesystem error has
@@ -812,6 +827,26 @@ struct btrfs_fs_info {
#endif
};
+static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->generation);
+}
+
+static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen)
+{
+ WRITE_ONCE(fs_info->generation, gen);
+}
+
+static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_trans_committed);
+}
+
+static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen)
+{
+ WRITE_ONCE(fs_info->last_trans_committed, gen);
+}
+
static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
u64 gen)
{
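A hypothetical usage sketch for the new accessors: outside of a transaction the generation must go through the READ_ONCE-based helper, while code that holds a transaction handle may keep reading fs_info->generation (or trans->transid) directly, as the new comments in struct btrfs_fs_info spell out.

static bool example_is_current_generation(struct btrfs_fs_info *fs_info, u64 gen)
{
	/* Safe without a transaction handle: pairs with btrfs_set_fs_generation(). */
	return gen == btrfs_get_fs_generation(fs_info);
}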
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 4c322b720a80..7d734830e514 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -167,7 +167,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + del_len,
item_size - (ptr + del_len - item_start));
- btrfs_truncate_item(path, item_size - del_len, 1);
+ btrfs_truncate_item(trans, path, item_size - del_len, 1);
out:
btrfs_free_path(path);
@@ -229,7 +229,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- btrfs_truncate_item(path, item_size - sub_item_len, 1);
+ btrfs_truncate_item(trans, path, item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
@@ -247,7 +247,7 @@ out:
}
/*
- * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ * Insert an extended inode ref into a tree.
*
* The caller must have checked against BTRFS_LINK_MAX already.
*/
@@ -282,7 +282,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
name))
goto out;
- btrfs_extend_item(path, ins_len);
+ btrfs_extend_item(trans, path, ins_len);
ret = 0;
}
if (ret < 0)
@@ -299,7 +299,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)&extref->name;
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
@@ -338,7 +338,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
- btrfs_extend_item(path, ins_len);
+ btrfs_extend_item(trans, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -364,7 +364,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
}
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
@@ -591,7 +591,7 @@ search_again:
num_dec = (orig_num_bytes - extent_num_bytes);
if (extent_start != 0)
control->sub_bytes += num_dec;
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
extent_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
@@ -617,7 +617,7 @@ search_again:
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(path, size, 1);
+ btrfs_truncate_item(trans, path, size, 1);
} else if (!del_item) {
/*
* We have to bail so the last_size is set to
@@ -676,7 +676,8 @@ delete:
bytes_deleted += extent_num_bytes;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0);
+ extent_start, extent_num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
control->ino, extent_offset,
root->root_key.objectid, false);
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index ede43b6c6559..4337bb26f419 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -4,6 +4,7 @@
#define BTRFS_INODE_ITEM_H
#include <linux/types.h>
+#include <linux/crc32c.h>
struct btrfs_trans_handle;
struct btrfs_root;
@@ -12,6 +13,7 @@ struct btrfs_key;
struct btrfs_inode_extref;
struct btrfs_inode;
struct extent_buffer;
+struct fscrypt_str;
/*
* Return this if we need to call truncate_block for the last bit of the
@@ -76,6 +78,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
*ro_flags = (u32)(inode_item_flags >> 32);
}
+/* Figure the key offset of an extended inode ref. */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len)
+{
+ return (u64)crc32c(parent_objectid, name, len);
+}
+
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_truncate_control *control);
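An illustrative sketch of how the relocated btrfs_extref_hash() is typically consumed: the hash becomes the key offset used to locate an INODE_EXTREF item. The key layout is assumed from existing btrfs code, not introduced by this patch, and the function name is hypothetical.

static void example_extref_key(struct btrfs_key *key, u64 ino, u64 parent_objectid,
			       const struct fscrypt_str *name)
{
	key->objectid = ino;
	key->type = BTRFS_INODE_EXTREF_KEY;
	key->offset = btrfs_extref_hash(parent_objectid,
					(const char *)name->name, name->len);
}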
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7814b9d654ce..fb3c3f43c3fa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@
#include "super.h"
#include "orphan.h"
#include "backref.h"
+#include "raid-stripe-tree.h"
struct btrfs_iget_args {
u64 ino;
@@ -348,7 +349,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
}
/*
- * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ * Lock inode i_rwsem based on arguments passed.
*
* ilock_flags can have the following bit set:
*
@@ -382,7 +383,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
}
/*
- * btrfs_inode_unlock - unock inode i_rwsem
+ * Unlock inode i_rwsem.
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
@@ -573,7 +574,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
kunmap_local(kaddr);
put_page(page);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -670,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
}
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -687,7 +688,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
@@ -1565,8 +1566,11 @@ out_unlock:
* Phase two of compressed writeback. This is the ordered portion of the code,
* which only gets called in the order the work was queued. We walk all the
* async extents created by compress_file_range and send them down to the disk.
+ *
+ * If called with @do_free == true, no extents are submitted; the function
+ * only drops the references held by the async chunk and frees the containing
+ * async_cow once its last chunk has been processed.
*/
-static noinline void submit_compressed_extents(struct btrfs_work *work)
+static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{
struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work);
@@ -1575,6 +1579,21 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
unsigned long nr_pages;
u64 alloc_hint = 0;
+ if (do_free) {
+ struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
+
+ async_chunk = container_of(work, struct async_chunk, work);
+ btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
+ return;
+ }
+
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
@@ -1591,21 +1610,6 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
cond_wake_up_nomb(&fs_info->async_submit_wait);
}
-static noinline void async_cow_free(struct btrfs_work *work)
-{
- struct async_chunk *async_chunk;
- struct async_cow *async_cow;
-
- async_chunk = container_of(work, struct async_chunk, work);
- btrfs_add_delayed_iput(async_chunk->inode);
- if (async_chunk->blkcg_css)
- css_put(async_chunk->blkcg_css);
-
- async_cow = async_chunk->async_cow;
- if (atomic_dec_and_test(&async_cow->num_chunks))
- kvfree(async_cow);
-}
-
static bool run_delalloc_compressed(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
u64 end, struct writeback_control *wbc)
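A sketch of the reworked callback wiring, with the ordered-callback signature inferred from the submit_compressed_extents() hunk above: the former dedicated free callback is folded into the ordered callback, which is invoked one last time with do_free == true. The struct and function names below are hypothetical.

struct example_job {
	struct btrfs_work work;
	/* ... caller data ... */
};

static void example_worker(struct btrfs_work *work)
{
	/* Normal (unordered) processing. */
}

static void example_ordered(struct btrfs_work *work, bool do_free)
{
	struct example_job *job = container_of(work, struct example_job, work);

	if (do_free) {
		/* Final invocation: only release resources, do no work. */
		kfree(job);
		return;
	}
	/* Ordered-completion processing would go here. */
}

/*
 * Wired up with the three-argument form used throughout this patch:
 *	btrfs_init_work(&job->work, example_worker, example_ordered);
 */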
@@ -1683,7 +1687,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
}
btrfs_init_work(&async_chunk[i].work, compress_file_range,
- submit_compressed_extents, async_cow_free);
+ submit_compressed_extents);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
@@ -2235,8 +2239,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
- test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
- 0, NULL))
+ test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
}
@@ -2847,7 +2850,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
ihold(inode);
btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
- btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
@@ -2912,7 +2915,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(struct btrfs_file_extent_item));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -3070,7 +3073,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
trans->block_rsv = &inode->block_rsv;
- ret = btrfs_update_inode_fallback(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3091,6 +3094,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
+ ret = btrfs_insert_raid_extent(trans, ordered_extent);
+ if (ret)
+ goto out;
+
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@@ -3136,7 +3143,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
&cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
- ret = btrfs_update_inode_fallback(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3224,7 +3231,8 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
- !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+ list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
return btrfs_finish_one_ordered(ordered);
}
@@ -3282,7 +3290,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
if (btrfs_is_data_reloc_root(inode->root) &&
test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
- 1, NULL)) {
+ NULL)) {
/* Skip the range without csum for data reloc inode */
clear_extent_bits(&inode->io_tree, file_offset, end,
EXTENT_NODATASUM);
@@ -3306,7 +3314,7 @@ zeroit:
}
/*
- * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ * Perform a delayed iput on @inode.
*
* @inode: The inode we want to perform iput on
*
@@ -3754,19 +3762,17 @@ static int btrfs_read_locked_inode(struct inode *inode,
btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
round_up(i_size_read(inode), fs_info->sectorsize));
- inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
- inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+ inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
- inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
+ inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ btrfs_timespec_nsec(leaf, &inode_item->mtime));
inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime.tv_sec =
- btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime.tv_nsec =
- btrfs_timespec_nsec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
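For readers following the timestamp conversion in this hunk, a compact sketch of the accessor mapping (the helpers are the generic VFS ones used by the + lines; i_otime_sec/i_otime_nsec are the split fields replacing the old struct timespec64 i_otime in btrfs_inode):

/*
 * Accessor mapping used in this conversion (sketch):
 *
 *	inode->i_atime = ts           ->  inode_set_atime(inode, ts.tv_sec, ts.tv_nsec)
 *	ts = inode->i_mtime           ->  ts = inode_get_mtime(inode)
 *	inode->i_mtime = ctime (now)  ->  inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode))
 *	BTRFS_I(inode)->i_otime       ->  BTRFS_I(inode)->i_otime_sec / i_otime_nsec
 */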
@@ -3792,7 +3798,7 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in delayed_nodes_tree.
*/
- if (BTRFS_I(inode)->last_trans == fs_info->generation)
+ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -3922,24 +3928,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->otime,
- BTRFS_I(inode)->i_otime.tv_sec);
- btrfs_set_token_timespec_nsec(&token, &item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec);
+ btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
btrfs_set_token_inode_generation(&token, item,
@@ -3957,8 +3961,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* copy everything in the in-memory inode into the btree.
*/
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
@@ -3969,7 +3972,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
+ ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -3981,7 +3984,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_set_inode_last_trans(trans, inode);
ret = 0;
failed:
@@ -3992,10 +3995,10 @@ failed:
/*
* copy everything in the in-memory inode into the btree.
*/
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode)
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -4011,23 +4014,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
- ret = btrfs_delayed_update_inode(trans, root, inode);
+ ret = btrfs_delayed_update_inode(trans, inode);
if (!ret)
btrfs_set_inode_last_trans(trans, inode);
return ret;
}
- return btrfs_update_inode_item(trans, root, inode);
+ return btrfs_update_inode_item(trans, inode);
}
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
int ret;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret == -ENOSPC)
- return btrfs_update_inode_item(trans, root, inode);
+ return btrfs_update_inode_item(trans, inode);
return ret;
}
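A brief note on the signature change that recurs through the rest of this patch: btrfs_update_inode() and its fallback no longer take a root because it can be derived from the inode itself, so a typical call site reduces to the sketch below.

	/* root is taken from inode->root inside the helper */
	ret = btrfs_update_inode(trans, BTRFS_I(inode));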
@@ -4132,9 +4135,8 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_ctime_current(&inode->vfs_inode);
- dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
- ret = btrfs_update_inode(trans, root, dir);
+ inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ ret = btrfs_update_inode(trans, dir);
out:
return ret;
}
@@ -4148,7 +4150,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, inode->root, inode);
+ ret = btrfs_update_inode(trans, inode);
}
return ret;
}
@@ -4306,8 +4308,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
inode_inc_iversion(&dir->vfs_inode);
- dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
- ret = btrfs_update_inode_fallback(trans, root, dir);
+ inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ ret = btrfs_update_inode_fallback(trans, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -4641,7 +4643,8 @@ out_notrans:
}
/*
- * btrfs_truncate_block - read, zero a chunk and write a block
+ * Read, zero a chunk and write a block.
+ *
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
* @len - the length to zero, 0 to zero the entire range respective to the
@@ -4791,9 +4794,9 @@ out:
return ret;
}
-static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
- u64 offset, u64 len)
+static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_drop_extents_args drop_args = { 0 };
@@ -4833,7 +4836,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
btrfs_abort_transaction(trans, ret);
} else {
btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, inode);
}
btrfs_end_transaction(trans);
return ret;
@@ -4889,8 +4892,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
- err = maybe_insert_hole(root, inode, cur_offset,
- hole_size);
+ err = maybe_insert_hole(inode, cur_offset, hole_size);
if (err)
break;
@@ -4916,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
- hole_em->generation = fs_info->generation;
+ hole_em->generation = btrfs_get_fs_generation(fs_info);
err = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
@@ -4956,7 +4958,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (newsize != oldsize) {
inode_inc_iversion(inode);
if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
}
}
@@ -4984,7 +4987,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
i_size_write(inode, newsize);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
pagecache_isize_extended(inode, oldsize, newsize);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
@@ -5129,7 +5132,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*/
if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
- end - start + 1);
+ end - start + 1, NULL);
clear_extent_bit(io_tree, start, end,
EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
@@ -5582,6 +5585,7 @@ static struct inode *new_simple_dir(struct inode *dir,
struct btrfs_key *key,
struct btrfs_root *root)
{
+ struct timespec64 ts;
struct inode *inode = new_inode(dir->i_sb);
if (!inode)
@@ -5600,9 +5604,13 @@ static struct inode *new_simple_dir(struct inode *dir,
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = inode_set_ctime_current(inode);
- inode->i_atime = dir->i_atime;
- BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+ ts = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_atime_to_ts(inode, inode_get_atime(dir));
+ BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+ BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+
inode->i_uid = dir->i_uid;
inode->i_gid = dir->i_gid;
@@ -6000,15 +6008,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
+ ret = btrfs_update_inode(trans, inode);
+ if (ret == -ENOSPC || ret == -EDQUOT) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
}
btrfs_end_transaction(trans);
if (inode->delayed_node)
@@ -6024,7 +6032,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
static int btrfs_update_time(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- bool dirty = flags & ~S_VERSION;
+ bool dirty;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -6160,6 +6168,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
{
+ struct timespec64 ts;
struct inode *dir = args->dir;
struct inode *inode = args->inode;
const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
@@ -6277,9 +6286,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
goto discard;
}
- inode->i_mtime = inode_set_ctime_current(inode);
- inode->i_atime = inode->i_mtime;
- BTRFS_I(inode)->i_otime = inode->i_mtime;
+ ts = simple_inode_init_ts(inode);
+ BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+ BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
/*
* We're going to fill the inode item now, so at this point the inode
@@ -6310,7 +6319,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
/*
* We don't need the path anymore, plus inheriting properties, adding
* ACLs, security xattrs, orphan item or adding the link, will result in
@@ -6444,10 +6453,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
* values (the ones it had when the fsync was done).
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- parent_inode->vfs_inode.i_mtime =
- inode_set_ctime_current(&parent_inode->vfs_inode);
+ inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+ inode_set_ctime_current(&parent_inode->vfs_inode));
- ret = btrfs_update_inode(trans, root, parent_inode);
+ ret = btrfs_update_inode(trans, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
return ret;
@@ -6598,7 +6607,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
} else {
struct dentry *parent = dentry->d_parent;
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ err = btrfs_update_inode(trans, BTRFS_I(inode));
if (err)
goto fail;
if (inode->i_nlink == 1) {
@@ -6974,8 +6983,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
+again:
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
if (ret)
return ERR_PTR(ret);
@@ -7103,8 +7119,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
range_end = round_up(offset + nocow_args.num_bytes,
root->fs_info->sectorsize) - 1;
- ret = test_range_bit(io_tree, offset, range_end,
- EXTENT_DELALLOC, 0, NULL);
+ ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
if (ret) {
ret = -EAGAIN;
goto out;
@@ -8005,11 +8020,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, &cached_state);
- spin_lock_irq(&inode->ordered_tree.lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
ordered->truncated_len = min(ordered->truncated_len,
cur - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree.lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
/*
* If the ordered extent has finished, we're safe to delete all
@@ -8044,7 +8059,7 @@ next:
* reserved data space.
* Since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
if (!inode_evicting) {
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_UPTODATE |
@@ -8339,7 +8354,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
break;
@@ -8392,7 +8407,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_update_inode(trans, inode);
if (ret2 && !ret)
ret = ret2;
@@ -8481,8 +8496,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->delayed_node = NULL;
- ei->i_otime.tv_sec = 0;
- ei->i_otime.tv_nsec = 0;
+ ei->i_otime_sec = 0;
+ ei->i_otime_nsec = 0;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
@@ -8491,7 +8506,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(fs_info, &ei->file_extent_tree,
IO_TREE_INODE_FILE_EXTENT);
mutex_init(&ei->log_mutex);
- btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ spin_lock_init(&ei->ordered_tree_lock);
+ ei->ordered_tree = RB_ROOT;
+ ei->ordered_tree_last = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
@@ -8634,8 +8651,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
- stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
- stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
+ stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
+ stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
if (bi_flags & BTRFS_INODE_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
if (bi_flags & BTRFS_INODE_COMPRESS)
@@ -8823,7 +8840,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -8838,7 +8855,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9083,7 +9100,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9208,7 +9225,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
return work;
}
@@ -9446,7 +9463,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
ptr = btrfs_file_extent_inline_start(ei);
write_extent_buffer(leaf, symname, ptr, name_len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
@@ -9474,7 +9491,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
- int qgroup_released;
+ u64 qgroup_released = 0;
int ret;
memset(&stack_fi, 0, sizeof(stack_fi));
@@ -9487,9 +9504,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
- qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
- if (qgroup_released < 0)
- return ERR_PTR(qgroup_released);
+ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
+ if (ret < 0)
+ return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
@@ -9639,7 +9656,7 @@ next:
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -10384,7 +10401,7 @@ out_delalloc_release:
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data:
if (ret < 0)
- btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
out_free_data_space:
/*
* If btrfs_reserve_extent() succeeded, then we already decremented
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8e7d03bc1b56..a1743904202b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -385,7 +385,7 @@ update_flags:
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
out_end_trans:
btrfs_end_transaction(trans);
@@ -652,18 +652,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
/* Tree log can't currently deal with an inode which is a new root. */
btrfs_set_log_full_commit(trans);
- ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit);
if (ret)
goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
- BTRFS_NESTING_NORMAL);
+ 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto out;
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
@@ -1290,6 +1290,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
* are limited to own subvolumes only
*/
ret = -EPERM;
+ } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * Snapshots must be made with the src_inode referring
+ * to the subvolume inode, otherwise the permission
+ * checking above is useless because we may have
+ * permission on a lower directory but not the subvol
+ * itself.
+ */
+ ret = -EINVAL;
} else {
ret = btrfs_mksnapshot(&file->f_path, idmap,
name, namelen,
@@ -1528,7 +1537,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
@@ -1660,7 +1669,7 @@ out:
static noinline int search_ioctl(struct inode *inode,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf)
{
struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
@@ -1733,7 +1742,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
- size_t buf_size;
+ u64 buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1763,8 +1772,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
- size_t buf_size;
- const size_t buf_limit = SZ_16M;
+ u64 buf_size;
+ const u64 buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2635,6 +2644,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
return -EINVAL;
}
+ if (fs_info->fs_devices->temp_fsid) {
+ btrfs_err(fs_info,
+ "device add not supported on cloned temp-fsid mount");
+ return -EINVAL;
+ }
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -2676,8 +2691,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct block_device *bdev = NULL;
- void *holder;
+ struct bdev_handle *bdev_handle = NULL;
int ret;
bool cancel = false;
@@ -2714,7 +2728,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
btrfs_exclop_finish(fs_info);
@@ -2728,8 +2742,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_handle)
+ bdev_release(bdev_handle);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2742,8 +2756,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
- struct block_device *bdev = NULL;
- void *holder;
+ struct bdev_handle *bdev_handle = NULL;
int ret;
bool cancel = false;
@@ -2770,15 +2783,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_handle)
+ bdev_release(bdev_handle);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2822,7 +2835,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
}
if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
- fi_args->generation = fs_info->generation;
+ fi_args->generation = btrfs_get_fs_generation(fs_info);
fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
}
@@ -2947,7 +2960,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
@@ -3131,7 +3144,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
return PTR_ERR(trans);
/* No running transaction, don't bother */
- transid = root->fs_info->last_trans_committed;
+ transid = btrfs_get_last_trans_committed(root->fs_info);
goto out;
}
transid = trans->transid;
@@ -3697,7 +3710,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
switch (sa->cmd) {
case BTRFS_QUOTA_CTL_ENABLE:
- ret = btrfs_quota_enable(fs_info);
+ case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA:
+ ret = btrfs_quota_enable(fs_info, sa);
break;
case BTRFS_QUOTA_CTL_DISABLE:
ret = btrfs_quota_disable(fs_info);
@@ -4351,6 +4365,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat
arg->clone_sources = compat_ptr(args32.clone_sources);
arg->parent_root = args32.parent_root;
arg->flags = args32.flags;
+ arg->version = args32.version;
memcpy(arg->reserved, args32.reserved,
sizeof(args32.reserved));
#else
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 7979449a58d6..74d8e2003f58 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -8,6 +8,7 @@
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
+#include <trace/events/btrfs.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
@@ -73,6 +74,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
+ { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
{ .id = 0, DEFINE_NAME("tree") },
};
@@ -102,6 +104,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff
#endif
+#ifdef CONFIG_BTRFS_DEBUG
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
+{
+ eb->lock_owner = owner;
+}
+#else
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
+#endif
+
/*
* Extent buffer locking
* =====================
@@ -164,7 +175,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
if (down_write_trylock(&eb->lock)) {
- eb->lock_owner = current->pid;
+ btrfs_set_eb_lock_owner(eb, current->pid);
trace_btrfs_try_tree_write_lock(eb);
return 1;
}
@@ -181,7 +192,8 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
}
/*
- * __btrfs_tree_lock - lock eb for write
+ * Lock eb for write.
+ *
* @eb: the eb to lock
* @nest: the nesting to use for the lock
*
@@ -196,7 +208,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
start_ns = ktime_get_ns();
down_write_nested(&eb->lock, nest);
- eb->lock_owner = current->pid;
+ btrfs_set_eb_lock_owner(eb, current->pid);
trace_btrfs_tree_lock(eb, start_ns);
}
@@ -211,7 +223,7 @@ void btrfs_tree_lock(struct extent_buffer *eb)
void btrfs_tree_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_unlock(eb);
- eb->lock_owner = 0;
+ btrfs_set_eb_lock_owner(eb, 0);
up_write(&eb->lock);
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 7695decc7243..b8f9c9e56c8c 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -72,11 +72,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
* over the error. Each subsequent error that doesn't have any context
* of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
*/
-const char * __attribute_const__ btrfs_decode_error(int errno)
+const char * __attribute_const__ btrfs_decode_error(int error)
{
char *errstr = "unknown";
- switch (errno) {
+ switch (error) {
case -ENOENT: /* -2 */
errstr = "No such entry";
break;
@@ -110,12 +110,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno)
}
/*
- * __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the appropriate error response.
+ * Decodes expected errors from the caller and invokes the appropriate error
+ * response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
@@ -132,11 +132,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Special case: if the error is EROFS, and we're already under
* SB_RDONLY, then it is safe here.
*/
- if (errno == -EROFS && sb_rdonly(sb))
+ if (error == -EROFS && sb_rdonly(sb))
return;
#ifdef CONFIG_PRINTK
- errstr = btrfs_decode_error(errno);
+ errstr = btrfs_decode_error(error);
btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
@@ -147,11 +147,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
vaf.va = &args;
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, statestr, function, line, errno, errstr, &vaf);
+ sb->s_id, statestr, function, line, error, errstr, &vaf);
va_end(args);
} else {
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
- sb->s_id, statestr, function, line, errno, errstr);
+ sb->s_id, statestr, function, line, error, errstr);
}
#endif
@@ -159,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Today we only save the error info to memory. Long term we'll also
* send it down to the disk.
*/
- WRITE_ONCE(fs_info->fs_error, errno);
+ WRITE_ONCE(fs_info->fs_error, error);
/* Don't go through full error handling during mount. */
if (!(sb->s_flags & SB_BORN))
@@ -283,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
#endif
/*
- * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
- * alert, and either panics or BUGs, depending on mount options.
+ * Decode unexpected, fatal errors from the caller, issue an alert, and either
+ * panic or BUG, depending on mount options.
*/
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
char *s_id = "<unknown>";
const char *errstr;
@@ -301,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
va_start(args, fmt);
vaf.va = &args;
- errstr = btrfs_decode_error(errno);
+ errstr = btrfs_decode_error(error);
if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
- s_id, function, line, &vaf, errno, errstr);
+ s_id, function, line, &vaf, error, errstr);
btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
- function, line, &vaf, errno, errstr);
+ function, line, &vaf, error, errstr);
va_end(args);
/* Caller calls BUG() */
}
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 1ae6f8e23e07..4d04c1fa5899 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -184,25 +184,25 @@ do { \
__printf(5, 6)
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
+ unsigned int line, int error, const char *fmt, ...);
-const char * __attribute_const__ btrfs_decode_error(int errno);
+const char * __attribute_const__ btrfs_decode_error(int error);
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+#define btrfs_handle_fs_error(fs_info, error, fmt, args...) \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args)
+ (error), fmt, ##args)
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
+ unsigned int line, int error, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
* will panic(). Otherwise we BUG() here.
*/
-#define btrfs_panic(fs_info, errno, fmt, args...) \
+#define btrfs_panic(fs_info, error, fmt, args...) \
do { \
- __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ __btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args); \
BUG(); \
} while (0)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 345c449d588c..a82e1417c4d2 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
* Look for the first ordered struct that has this offset, otherwise
* the first one less than this offset.
*/
-static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
- u64 file_offset)
+static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
+ u64 file_offset)
{
- struct rb_root *root = &tree->tree;
struct rb_node *prev = NULL;
struct rb_node *ret;
struct btrfs_ordered_extent *entry;
- if (tree->last) {
- entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+ if (inode->ordered_tree_last) {
+ entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
rb_node);
if (in_range(file_offset, entry->file_offset, entry->num_bytes))
- return tree->last;
+ return inode->ordered_tree_last;
}
- ret = __tree_search(root, file_offset, &prev);
+ ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
if (!ret)
ret = prev;
if (ret)
- tree->last = ret;
+ inode->ordered_tree_last = ret;
return ret;
}
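
The renamed ordered_tree_search() keeps the old lookup strategy: try the cached last-hit node (inode->ordered_tree_last) first, otherwise do the full search and remember the result. Below is a rough userspace sketch of that cached-lookup pattern using a sorted array instead of an rbtree; struct range, contains() and the data in main() are invented for illustration and are not btrfs code.

#include <stdio.h>

struct range { unsigned long start, len; };

/* Sorted, non-overlapping ranges standing in for the per-inode ordered tree. */
static struct range ranges[] = {
	{ 0, 4096 }, { 8192, 4096 }, { 16384, 8192 },
};
static int cached_last = -1;	/* plays the role of inode->ordered_tree_last */

static int contains(const struct range *r, unsigned long off)
{
	return off >= r->start && off < r->start + r->len;
}

/* Return the index of the range holding @off, else the last one below it. */
static int range_search(unsigned long off)
{
	int best = -1;

	/* Fast path: repeated lookups in the same range hit the cache. */
	if (cached_last >= 0 && contains(&ranges[cached_last], off))
		return cached_last;

	for (int i = 0; i < (int)(sizeof(ranges) / sizeof(ranges[0])); i++) {
		if (contains(&ranges[i], off)) {
			best = i;
			break;
		}
		if (ranges[i].start < off)
			best = i;
	}
	if (best >= 0)
		cached_last = best;	/* remember the hit, like ordered_tree_last */
	return best;
}

int main(void)
{
	printf("offset 9000 -> range %d\n", range_search(9000));	/* 1 */
	printf("offset 9001 -> range %d (cache hit)\n", range_search(9001));
	printf("offset 5000 -> range %d (preceding)\n", range_search(5000));
	return 0;
}
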
@@ -153,11 +152,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
{
struct btrfs_ordered_extent *entry;
int ret;
+ u64 qgroup_rsv = 0;
if (flags &
((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
/* For nocow write, we can release the qgroup rsv right now */
- ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
+ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
if (ret < 0)
return ERR_PTR(ret);
} else {
@@ -165,7 +165,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
* The ordered extent has reserved qgroup space, release now
* and pass the reserved number for qgroup_record to free.
*/
- ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
+ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
if (ret < 0)
return ERR_PTR(ret);
}
@@ -183,7 +183,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
- entry->qgroup_rsv = ret;
+ entry->qgroup_rsv = qgroup_rsv;
entry->flags = flags;
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
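
btrfs_qgroup_release_data() and btrfs_qgroup_free_data() now return only an error code and hand the released byte count back through a u64 out-parameter, so alloc_ordered_extent() no longer has to carry a 64-bit count in its int return value. A minimal userspace sketch of that calling convention follows; release_bytes() and the numbers in main() are invented, not btrfs APIs.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Return 0 or a negative error; the released byte count comes back through
 * @released, so it is never truncated by the int return value.
 */
static int release_bytes(uint64_t reserved, uint64_t *released)
{
	if (reserved == 0)
		return -EINVAL;
	*released = reserved;		/* may exceed INT_MAX safely */
	return 0;
}

int main(void)
{
	uint64_t qgroup_rsv = 0;
	int ret;

	ret = release_bytes(3ULL << 31, &qgroup_rsv);	/* ~6 GiB, > INT_MAX */
	if (ret < 0) {
		fprintf(stderr, "release failed: %d\n", ret);
		return 1;
	}
	printf("released %llu bytes\n", (unsigned long long)qgroup_rsv);
	return 0;
}
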
@@ -191,6 +191,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
+ INIT_LIST_HEAD(&entry->bioc_list);
init_completion(&entry->completion);
/*
@@ -208,7 +209,6 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
struct btrfs_inode *inode = BTRFS_I(entry->inode);
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -221,13 +221,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
/* One ref for the tree. */
refcount_inc(&entry->refs);
- spin_lock_irq(&tree->lock);
- node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = tree_insert(&inode->ordered_tree, entry->file_offset,
+ &entry->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -287,12 +288,11 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
- tree = &BTRFS_I(entry->inode)->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
static void finish_ordered_fn(struct btrfs_work *work)
@@ -310,7 +310,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- lockdep_assert_held(&inode->ordered_tree.lock);
+ lockdep_assert_held(&inode->ordered_tree_lock);
if (page) {
ASSERT(page->mapping);
@@ -364,7 +364,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
fs_info->endio_freespace_worker : fs_info->endio_write_workers;
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
btrfs_queue_work(wq, &ordered->work);
}
@@ -378,9 +378,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
- spin_lock_irqsave(&inode->ordered_tree.lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
- spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
if (ret)
btrfs_queue_ordered_fn(ordered);
@@ -404,7 +404,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
u64 num_bytes, bool uptodate)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
@@ -414,13 +413,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
file_offset + num_bytes - 1,
uptodate);
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
while (cur < file_offset + num_bytes) {
u64 entry_end;
u64 end;
u32 len;
- node = tree_search(tree, cur);
+ node = ordered_tree_search(inode, cur);
/* No ordered extents at all */
if (!node)
break;
@@ -467,13 +466,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
len = end + 1 - cur;
if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
btrfs_queue_ordered_fn(entry);
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
}
cur += len;
}
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
}
/*
@@ -497,19 +496,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
bool finished = false;
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
if (cached && *cached) {
entry = *cached;
goto have_entry;
}
- node = tree_search(tree, file_offset);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -540,7 +538,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
return finished;
}
@@ -578,7 +576,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = btrfs_inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -603,22 +600,23 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
release = entry->disk_num_bytes;
else
release = entry->num_bytes;
- btrfs_delalloc_release_metadata(btrfs_inode, release, false);
+ btrfs_delalloc_release_metadata(btrfs_inode, release,
+ test_bit(BTRFS_ORDERED_IOERR,
+ &entry->flags));
}
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
- tree = &btrfs_inode->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&btrfs_inode->ordered_tree_lock);
node = &entry->rb_node;
- rb_erase(node, &tree->tree);
+ rb_erase(node, &btrfs_inode->ordered_tree);
RB_CLEAR_NODE(node);
- if (tree->last == node)
- tree->last = NULL;
+ if (btrfs_inode->ordered_tree_last == node)
+ btrfs_inode->ordered_tree_last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
/*
* The current running transaction is waiting on us, we need to let it
@@ -711,7 +709,7 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
- btrfs_run_ordered_extent_work, NULL, NULL);
+ btrfs_run_ordered_extent_work, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
@@ -875,14 +873,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
- tree = &inode->ordered_tree;
- spin_lock_irqsave(&tree->lock, flags);
- node = tree_search(tree, file_offset);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -894,7 +890,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
trace_btrfs_ordered_extent_lookup(inode, entry);
}
out:
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
return entry;
}
@@ -904,15 +900,13 @@ out:
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
- node = tree_search(tree, file_offset);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = ordered_tree_search(inode, file_offset);
if (!node) {
- node = tree_search(tree, file_offset + len);
+ node = ordered_tree_search(inode, file_offset + len);
if (!node)
goto out;
}
@@ -936,7 +930,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_range(inode, entry);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -947,13 +941,12 @@ out:
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *n;
ASSERT(inode_is_locked(&inode->vfs_inode));
- spin_lock_irq(&tree->lock);
- for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ spin_lock_irq(&inode->ordered_tree_lock);
+ for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
struct btrfs_ordered_extent *ordered;
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
@@ -966,7 +959,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
refcount_inc(&ordered->refs);
trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
/*
@@ -976,13 +969,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
- node = tree_search(tree, file_offset);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -990,7 +981,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -1006,15 +997,14 @@ out:
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct rb_node *cur;
struct rb_node *prev;
struct rb_node *next;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&tree->lock);
- node = tree->tree.rb_node;
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = inode->ordered_tree.rb_node;
/*
* Here we don't want to use tree_search() which will use tree->last
* and screw up the search order.
@@ -1068,7 +1058,7 @@ out:
trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -1147,7 +1137,6 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len)
{
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 file_offset = ordered->file_offset;
@@ -1187,13 +1176,13 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
refcount_inc(&new->refs);
spin_lock_irq(&root->ordered_extent_lock);
- spin_lock(&tree->lock);
+ spin_lock(&inode->ordered_tree_lock);
/* Remove from tree once */
node = &ordered->rb_node;
- rb_erase(node, &tree->tree);
+ rb_erase(node, &inode->ordered_tree);
RB_CLEAR_NODE(node);
- if (tree->last == node)
- tree->last = NULL;
+ if (inode->ordered_tree_last == node)
+ inode->ordered_tree_last = NULL;
ordered->file_offset += len;
ordered->disk_bytenr += len;
@@ -1224,18 +1213,19 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
}
/* Re-insert the node */
- node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
+ node = tree_insert(&inode->ordered_tree, ordered->file_offset,
+ &ordered->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"zoned: inconsistency in ordered tree at offset %llu",
ordered->file_offset);
- node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
+ node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"zoned: inconsistency in ordered tree at offset %llu",
new->file_offset);
- spin_unlock(&tree->lock);
+ spin_unlock(&inode->ordered_tree_lock);
list_add_tail(&new->root_extent_list, &root->ordered_extents);
root->nr_ordered_extents++;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 173bd5c5df26..567a6d3d4712 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -6,13 +6,6 @@
#ifndef BTRFS_ORDERED_DATA_H
#define BTRFS_ORDERED_DATA_H
-/* one of these per inode */
-struct btrfs_ordered_inode_tree {
- spinlock_t lock;
- struct rb_root tree;
- struct rb_node *last;
-};
-
struct btrfs_ordered_sum {
/*
* Logical start address and length of the blocks covered by
@@ -151,15 +144,9 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
-};
-static inline void
-btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
-{
- spin_lock_init(&t->lock);
- t->tree = RB_ROOT;
- t->last = NULL;
-}
+ struct list_head bioc_list;
+};
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0c93439e929f..7e46aa8a0444 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -9,6 +9,8 @@
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
+#include "volumes.h"
+#include "raid-stripe-tree.h"
struct root_name_map {
u64 id;
@@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = {
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
{ BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+ { BTRFS_RAID_STRIPE_TREE_OBJECTID, "RAID_STRIPE_TREE" },
};
const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
@@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
+static void print_extent_owner_ref(const struct extent_buffer *eb,
+ const struct btrfs_extent_owner_ref *ref)
+{
+ ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA));
+ pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref));
+}
+
static void print_extent_item(const struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
+ struct btrfs_extent_owner_ref *oref;
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
@@ -161,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
"\t\t\t(parent %llu not aligned to sectorsize %u)\n",
offset, eb->fs_info->sectorsize);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ print_extent_owner_ref(eb, oref);
+ break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
eb->start, type);
@@ -189,6 +204,22 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
}
}
+static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size,
+ struct btrfs_stripe_extent *stripe)
+{
+ const int num_stripes = btrfs_num_raid_stripes(item_size);
+ const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe);
+
+ pr_info("\t\t\tencoding: %s\n",
+ (encoding && encoding < BTRFS_NR_RAID_TYPES) ?
+ btrfs_raid_array[encoding].raid_name : "unknown");
+
+ for (int i = 0; i < num_stripes; i++)
+ pr_info("\t\t\tstride %d devid %llu physical %llu\n",
+ i, btrfs_raid_stride_devid(eb, &stripe->strides[i]),
+ btrfs_raid_stride_physical(eb, &stripe->strides[i]));
+}
+
/*
* Helper to output refs and locking status of extent buffer. Useful to debug
* race condition related problems.
@@ -349,6 +380,10 @@ void btrfs_print_leaf(const struct extent_buffer *l)
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
btrfs_item_size(l, i));
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ print_raid_stripe_key(l, btrfs_item_size(l, i),
+ btrfs_item_ptr(l, i, struct btrfs_stripe_extent));
+ break;
}
}
}
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 0755af0e53e3..f9bf591a0718 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -15,6 +15,7 @@
#include "fs.h"
#include "accessors.h"
#include "super.h"
+#include "dir-item.h"
#define BTRFS_PROP_HANDLERS_HT_BITS 8
static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b99230db3c82..e46774e8f49f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,6 +30,25 @@
#include "root-tree.h"
#include "tree-checker.h"
+enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
+{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return BTRFS_QGROUP_MODE_DISABLED;
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
+ return BTRFS_QGROUP_MODE_SIMPLE;
+ return BTRFS_QGROUP_MODE_FULL;
+}
+
+bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
+}
+
+bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
+}
+
/*
* Helpers to access qgroup reservation
*
@@ -146,16 +165,6 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
-static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
-{
- return (u64)(uintptr_t)qg;
-}
-
-static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
-{
- return (struct btrfs_qgroup *)(uintptr_t)n->aux;
-}
-
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
@@ -180,34 +189,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
return NULL;
}
-/* must be called with qgroup_lock held */
+/*
+ * Add qgroup to the filesystem's qgroup tree.
+ *
+ * Must be called with qgroup_lock held and @prealloc preallocated.
+ *
+ * Ownership of @prealloc is transferred to this function, so the caller
+ * must not touch @prealloc afterwards.
+ */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *prealloc,
u64 qgroupid)
{
struct rb_node **p = &fs_info->qgroup_tree.rb_node;
struct rb_node *parent = NULL;
struct btrfs_qgroup *qgroup;
+ /* Caller must have pre-allocated @prealloc. */
+ ASSERT(prealloc);
+
while (*p) {
parent = *p;
qgroup = rb_entry(parent, struct btrfs_qgroup, node);
- if (qgroup->qgroupid < qgroupid)
+ if (qgroup->qgroupid < qgroupid) {
p = &(*p)->rb_left;
- else if (qgroup->qgroupid > qgroupid)
+ } else if (qgroup->qgroupid > qgroupid) {
p = &(*p)->rb_right;
- else
+ } else {
+ kfree(prealloc);
return qgroup;
+ }
}
- qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
- if (!qgroup)
- return ERR_PTR(-ENOMEM);
-
+ qgroup = prealloc;
qgroup->qgroupid = qgroupid;
INIT_LIST_HEAD(&qgroup->groups);
INIT_LIST_HEAD(&qgroup->members);
INIT_LIST_HEAD(&qgroup->dirty);
+ INIT_LIST_HEAD(&qgroup->iterator);
+ INIT_LIST_HEAD(&qgroup->nested_iterator);
rb_link_node(&qgroup->node, parent, p);
rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
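
With the GFP_ATOMIC allocation removed, callers preallocate the qgroup outside the spinlock and hand ownership to add_qgroup_rb(), which frees the preallocation when the id already exists. The following is a small userspace model of that ownership-transfer insert; the sorted list, insert_prealloc() and the ids in main() are invented for illustration, only the ownership rule mirrors the patch.

#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned long long id;
	struct node *next;
};

static struct node *head;

/*
 * Insert @prealloc into the sorted list keyed by @id. Ownership of @prealloc
 * transfers to this function: on a duplicate it is freed and the existing
 * node is returned, so the caller must never touch @prealloc afterwards.
 */
static struct node *insert_prealloc(struct node *prealloc, unsigned long long id)
{
	struct node **p = &head;

	while (*p && (*p)->id < id)
		p = &(*p)->next;
	if (*p && (*p)->id == id) {
		free(prealloc);		/* duplicate: drop the preallocation */
		return *p;
	}
	prealloc->id = id;
	prealloc->next = *p;
	*p = prealloc;
	return prealloc;
}

int main(void)
{
	unsigned long long ids[] = { 5, 1, 5 };	/* the second 5 is a duplicate */

	for (int i = 0; i < 3; i++) {
		/* Allocate outside any lock, as the patch does with GFP_KERNEL/NOFS. */
		struct node *prealloc = calloc(1, sizeof(*prealloc));

		if (!prealloc)
			return 1;
		/* In the kernel this call would run under qgroup_lock. */
		struct node *n = insert_prealloc(prealloc, ids[i]);

		prealloc = NULL;	/* ownership is gone, never reuse it */
		printf("id %llu -> node %p\n", n->id, (void *)n);
	}

	while (head) {			/* tear down */
		struct node *n = head;

		head = head->next;
		free(n);
	}
	return 0;
}
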
@@ -254,27 +275,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
/*
* Add relation specified by two qgroups.
*
- * Must be called with qgroup_lock held.
+ * Must be called with qgroup_lock held; ownership of @prealloc is
+ * transferred to this function and the caller must not touch it again.
*
* Return: 0 on success
* -ENOENT if one of the qgroups is NULL
* <0 other errors
*/
-static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
+static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
+ struct btrfs_qgroup *member,
+ struct btrfs_qgroup *parent)
{
- struct btrfs_qgroup_list *list;
-
- if (!member || !parent)
+ if (!member || !parent) {
+ kfree(prealloc);
return -ENOENT;
+ }
- list = kzalloc(sizeof(*list), GFP_ATOMIC);
- if (!list)
- return -ENOMEM;
-
- list->group = parent;
- list->member = member;
- list_add_tail(&list->next_group, &member->groups);
- list_add_tail(&list->next_member, &parent->members);
+ prealloc->group = parent;
+ prealloc->member = member;
+ list_add_tail(&prealloc->next_group, &member->groups);
+ list_add_tail(&prealloc->next_member, &parent->members);
return 0;
}
@@ -288,7 +308,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
* -ENOENT if one of the ids does not exist
* <0 other errors
*/
-static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_list *prealloc,
+ u64 memberid, u64 parentid)
{
struct btrfs_qgroup *member;
struct btrfs_qgroup *parent;
@@ -296,7 +318,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare
member = find_qgroup_rb(fs_info, memberid);
parent = find_qgroup_rb(fs_info, parentid);
- return __add_relation_rb(member, parent);
+ return __add_relation_rb(prealloc, member, parent);
}
/* Must be called with qgroup_lock held */
@@ -340,11 +362,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
{
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return;
fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
}
+static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot,
+ struct btrfs_qgroup_status_item *ptr)
+{
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
+ fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr);
+}
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -361,7 +394,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
u64 flags = 0;
u64 rescan_progress = 0;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!fs_info->quota_root)
return 0;
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
@@ -411,14 +444,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
"old qgroup version, quota disabled");
goto out;
}
- if (btrfs_qgroup_status_generation(l, ptr) !=
- fs_info->generation) {
+ fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+ qgroup_read_enable_gen(fs_info, l, slot, ptr);
+ } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
qgroup_mark_inconsistent(fs_info);
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
- fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
- ptr);
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -434,11 +467,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup_mark_inconsistent(fs_info);
}
if (!qgroup) {
- qgroup = add_qgroup_rb(fs_info, found_key.offset);
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
+ struct btrfs_qgroup *prealloc;
+
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ if (!prealloc) {
+ ret = -ENOMEM;
goto out;
}
+ qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0)
@@ -489,6 +525,8 @@ next1:
if (ret)
goto out;
while (1) {
+ struct btrfs_qgroup_list *list = NULL;
+
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
@@ -502,8 +540,14 @@ next1:
goto next2;
}
- ret = add_relation_rb(fs_info, found_key.objectid,
+ list = kzalloc(sizeof(*list), GFP_KERNEL);
+ if (!list) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_relation_rb(fs_info, list, found_key.objectid,
found_key.offset);
+ list = NULL;
if (ret == -ENOENT) {
btrfs_warn(fs_info,
"orphan qgroup relation 0x%llx->0x%llx",
@@ -522,13 +566,12 @@ next2:
out:
btrfs_free_path(path);
fs_info->qgroup_flags |= flags;
- if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
- ret >= 0)
- ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-
- if (ret < 0) {
+ if (ret >= 0) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+ } else {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -550,7 +593,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
struct rb_node *node;
bool ret = false;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
return ret;
/*
* Since we're unmounting, there is no race and no need to grab qgroup
@@ -622,7 +665,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_free_path(path);
return ret;
@@ -700,7 +743,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -719,7 +762,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -808,7 +851,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -854,7 +897,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -896,7 +939,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -949,7 +992,8 @@ out:
return ret;
}
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
{
struct btrfs_root *quota_root;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -959,8 +1003,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
+ struct btrfs_qgroup *prealloc = NULL;
struct btrfs_trans_handle *trans = NULL;
struct ulist *ulist = NULL;
+ const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
int ret = 0;
int slot;
@@ -1063,13 +1109,18 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup_status_item);
btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
- fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
+ if (simple) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
+ btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
+ } else {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
key.objectid = 0;
key.type = BTRFS_ROOT_REF_KEY;
@@ -1094,6 +1145,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
/* Release locks on tree_root before we access quota_root */
btrfs_release_path(path);
+ /* We should not have a stray @prealloc pointer. */
+ ASSERT(prealloc == NULL);
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
@@ -1101,7 +1161,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out_free_path;
}
- qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
+ prealloc = NULL;
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
btrfs_abort_transaction(trans, ret);
@@ -1144,18 +1205,22 @@ out_add_root:
goto out_free_path;
}
- qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- btrfs_abort_transaction(trans, ret);
+ ASSERT(prealloc == NULL);
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
goto out_free_path;
}
+ qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
+ prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ fs_info->qgroup_enable_gen = trans->transid;
+
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* Commit the transaction while not holding qgroup_ioctl_lock, to avoid
@@ -1180,8 +1245,14 @@ out_add_root:
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ if (simple)
+ btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
spin_unlock(&fs_info->qgroup_lock);
+ /* Skip rescan for simple qgroups. */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ goto out_free_path;
+
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
@@ -1222,6 +1293,39 @@ out:
else if (trans)
ret = btrfs_end_transaction(trans);
ulist_free(ulist);
+ kfree(prealloc);
+ return ret;
+}
+
+/*
+ * It is possible to have outstanding ordered extents which reserved bytes
+ * before quotas were disabled. We need to fully flush delalloc and ordered
+ * extents, and then commit, to ensure that we don't leak such reservations,
+ * only to have them come back if we re-enable quotas.
+ *
+ * - enable simple quotas
+ * - reserve space
+ * - release it, store rsv_bytes in OE
+ * - disable quotas
+ * - enable simple quotas (qgroup rsv are all 0)
+ * - OE finishes
+ * - run delayed refs
+ * - free rsv_bytes, resulting in miscounting or even underflow
+ */
+static int flush_reservations(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_commit_transaction(trans);
+
return ret;
}
@@ -1269,6 +1373,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
+ ret = flush_reservations(fs_info);
+ if (ret)
+ goto out_unlock_cleaner;
+
/*
* 1 For the root item
*
@@ -1295,6 +1403,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
spin_unlock(&fs_info->qgroup_lock);
@@ -1329,7 +1438,8 @@ out:
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
- ret = btrfs_end_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+out_unlock_cleaner:
mutex_unlock(&fs_info->cleaner_mutex);
return ret;
@@ -1342,6 +1452,24 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
+static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+ if (!list_empty(&qgroup->iterator))
+ return;
+
+ list_add_tail(&qgroup->iterator, head);
+}
+
+static void qgroup_iterator_clean(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
+ list_del_init(&qgroup->iterator);
+ }
+}
+
/*
* The easy accounting, we're updating qgroup relationship whose child qgroup
* only has exclusive extents.
@@ -1356,14 +1484,12 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
*
* Caller should hold fs_info->qgroup_lock.
*/
-static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
- struct ulist *tmp, u64 ref_root,
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup_list *glist;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ struct btrfs_qgroup *cur;
+ LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
int ret = 0;
@@ -1371,53 +1497,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
if (!qgroup)
goto out;
- qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
-
- WARN_ON(sign < 0 && qgroup->excl < num_bytes);
- qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
-
- if (sign > 0)
- qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
- else
- qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
-
- qgroup_dirty(fs_info, qgroup);
-
- /* Get all of the parent groups that contain this qgroup */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(cur, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
- /* Iterate all of the parents and adjust their reference counts */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- qgroup = unode_aux_to_qgroup(unode);
qgroup->rfer += sign * num_bytes;
qgroup->rfer_cmpr += sign * num_bytes;
+
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
else
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
- qgroup->excl_cmpr += sign * num_bytes;
qgroup_dirty(fs_info, qgroup);
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ /* Append parent qgroups to @qgroup_list. */
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
ret = 0;
out:
+ qgroup_iterator_clean(&qgroup_list);
return ret;
}
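
The rewrite drops the temporary ulist: each qgroup embeds an iterator list head, qgroup_iterator_add() only queues a qgroup that is not already on the list, the loop appends parents while it walks, and qgroup_iterator_clean() resets the markers afterwards. Here is a self-contained userspace sketch of the same embedded-membership worklist; the fixed-size array, struct group and the hierarchy in main() are invented stand-ins for the kernel's list_head machinery.

#include <stdbool.h>
#include <stdio.h>

#define MAX_GROUPS 8

struct group {
	const char *name;
	struct group *parents[2];
	bool on_list;		/* mimics !list_empty(&qgroup->iterator) */
};

/* Visit @start and every ancestor exactly once, without any allocation. */
static void for_each_ancestor(struct group *start)
{
	struct group *work[MAX_GROUPS];
	int head = 0, tail = 0;

	start->on_list = true;
	work[tail++] = start;

	while (head < tail) {	/* the list grows while we walk it */
		struct group *g = work[head++];

		printf("account %s\n", g->name);
		for (int i = 0; i < 2; i++) {
			struct group *p = g->parents[i];

			/* qgroup_iterator_add(): skip anything already queued */
			if (p && !p->on_list && tail < MAX_GROUPS) {
				p->on_list = true;
				work[tail++] = p;
			}
		}
	}

	/* qgroup_iterator_clean(): reset the markers for the next walk */
	while (tail > 0)
		work[--tail]->on_list = false;
}

int main(void)
{
	struct group top  = { "2/300", { NULL, NULL }, false };
	struct group a    = { "1/100", { &top, NULL }, false };
	struct group b    = { "1/200", { &top, NULL }, false };
	struct group leaf = { "0/5",   { &a, &b },     false };

	for_each_ancestor(&leaf);	/* prints 0/5, 1/100, 1/200, 2/300 once each */
	return 0;
}
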
@@ -1434,8 +1537,7 @@ out:
* Return < 0 for other error.
*/
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
- struct ulist *tmp, u64 src, u64 dst,
- int sign)
+ u64 src, u64 dst, int sign)
{
struct btrfs_qgroup *qgroup;
int ret = 1;
@@ -1446,8 +1548,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
goto out;
if (qgroup->excl == qgroup->rfer) {
ret = 0;
- err = __qgroup_excl_accounting(fs_info, tmp, dst,
- qgroup, sign);
+ err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
if (err < 0) {
ret = err;
goto out;
@@ -1459,28 +1560,19 @@ out:
return ret;
}
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
- u64 dst)
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct ulist *tmp;
- unsigned int nofs_flag;
+ struct btrfs_qgroup_list *prealloc = NULL;
int ret = 0;
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
- /* We hold a transaction handle open, must do a NOFS allocation. */
- nofs_flag = memalloc_nofs_save();
- tmp = ulist_alloc(GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!tmp)
- return -ENOMEM;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1501,6 +1593,11 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
}
+ prealloc = kzalloc(sizeof(*list), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = add_qgroup_relation_item(trans, src, dst);
if (ret)
goto out;
@@ -1512,16 +1609,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
spin_lock(&fs_info->qgroup_lock);
- ret = __add_relation_rb(member, parent);
+ ret = __add_relation_rb(prealloc, member, parent);
+ prealloc = NULL;
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
}
- ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+ ret = quick_update_accounting(fs_info, src, dst, 1);
spin_unlock(&fs_info->qgroup_lock);
out:
+ kfree(prealloc);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
- ulist_free(tmp);
return ret;
}
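
Another pattern introduced here is the allocation hand-off in btrfs_add_qgroup_relation(): the btrfs_qgroup_list is kzalloc()ed before qgroup_lock is taken, ownership passes to __add_relation_rb() under the lock (signalled by setting prealloc to NULL), and the unconditional kfree(prealloc) on the way out only frees whatever was not consumed. A hedged, self-contained model of that idiom, with generic names and a pthread mutex standing in for the spinlock:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct relation { int src, dst; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct relation *registry;  /* stand-in for the rb-tree entry */

/* Must not sleep: consumes @node on success, leaves it alone on failure. */
static int insert_locked(struct relation *node)
{
    if (registry)
        return -17;            /* -EEXIST */
    registry = node;           /* ownership transferred */
    return 0;
}

static int add_relation(int src, int dst)
{
    struct relation *prealloc;
    int ret;

    /* Allocate while sleeping is still allowed. */
    prealloc = calloc(1, sizeof(*prealloc));
    if (!prealloc)
        return -12;            /* -ENOMEM */
    prealloc->src = src;
    prealloc->dst = dst;

    pthread_mutex_lock(&lock);
    ret = insert_locked(prealloc);
    if (ret == 0)
        prealloc = NULL;       /* consumed, must not be freed below */
    pthread_mutex_unlock(&lock);

    free(prealloc);            /* no-op whenever ownership was handed over */
    return ret;
}

int main(void)
{
    printf("first insert:  %d\n", add_relation(256, 100));
    printf("second insert: %d\n", add_relation(256, 100));
    free(registry);
    return 0;
}
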
@@ -1532,19 +1630,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct ulist *tmp;
bool found = false;
- unsigned int nofs_flag;
int ret = 0;
int ret2;
- /* We hold a transaction handle open, must do a NOFS allocation. */
- nofs_flag = memalloc_nofs_save();
- tmp = ulist_alloc(GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!tmp)
- return -ENOMEM;
-
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
@@ -1582,11 +1671,10 @@ delete_item:
if (found) {
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
- ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+ ret = quick_update_accounting(fs_info, src, dst, -1);
spin_unlock(&fs_info->qgroup_lock);
}
out:
- ulist_free(tmp);
return ret;
}
@@ -1608,8 +1696,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
+ return 0;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1622,21 +1714,25 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
ret = add_qgroup_item(trans, quota_root, qgroupid);
if (ret)
goto out;
spin_lock(&fs_info->qgroup_lock);
- qgroup = add_qgroup_rb(fs_info, qgroupid);
+ qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
+ prealloc = NULL;
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- goto out;
- }
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ kfree(prealloc);
return ret;
}
@@ -1771,6 +1867,17 @@ out:
return ret;
}
+/*
+ * Inform qgroup to trace one dirty extent, whose info is recorded in @record,
+ * so qgroup can account for it at transaction commit time.
+ *
+ * No-lock version: the caller must hold the delayed ref lock and have already
+ * allocated the memory, then call btrfs_qgroup_trace_extent_post() after
+ * exiting the lock context.
+ *
+ * Return 0 for a successful insertion.
+ * Return >0 for an existing record; the caller can free @record safely.
+ * Errors are not possible.
+ */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
@@ -1780,6 +1887,9 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
+ if (!btrfs_qgroup_full_accounting(fs_info))
+ return 1;
+
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
@@ -1806,12 +1916,35 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * Post handler after qgroup_trace_extent_nolock().
+ *
+ * NOTE: Current qgroup code does the expensive backref walk at transaction
+ * commit time with TRANS_STATE_COMMIT_DOING, which blocks incoming new
+ * transactions.
+ * This is designed to allow btrfs_find_all_roots() to get a correct new_roots
+ * result.
+ *
+ * However for old_roots there is no need to do the backref walk at that time,
+ * since we search commit roots to walk backrefs and the result will always be
+ * correct.
+ *
+ * Due to the lockless nature of the insertion we can't do the backref walk
+ * there, so we must call btrfs_qgroup_trace_extent_post() after exiting the
+ * spinlock context.
+ *
+ * TODO: If we can fix and prove that btrfs_find_all_roots() can get a correct
+ * result using the current root, then we can move all the expensive backref
+ * walks out of transaction commit, but not now as qgroup accounting would be
+ * wrong again.
+ */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
int ret;
+ if (!btrfs_qgroup_full_accounting(trans->fs_info))
+ return 0;
/*
* We are always called in a context where we are already holding a
* transaction handle. Often we are called when adding a data delayed
@@ -1859,6 +1992,19 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes, so qgroup can account for it at transaction commit time.
+ *
+ * Better encapsulated version, with memory allocation and the backref walk
+ * for commit roots, so this function can sleep.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for errors, like memory allocation failure or an invalid
+ * parameter (NULL trans).
+ */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes)
{
@@ -1867,8 +2013,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct btrfs_delayed_ref_root *delayed_refs;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
- || bytenr == 0 || num_bytes == 0)
+ if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
return 0;
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record)
@@ -1889,6 +2034,12 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
return btrfs_qgroup_trace_extent_post(trans, record);
}
+/*
+ * Inform qgroup to trace all the data items in a leaf.
+ *
+ * Return 0 for success.
+ * Return <0 for errors (ENOMEM).
+ */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
@@ -1900,7 +2051,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
u64 bytenr, num_bytes;
/* We can be called directly from walk_up_proc() */
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
for (i = 0; i < nr; i++) {
@@ -2276,7 +2427,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
int level;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/* Wrong parameter order */
@@ -2319,6 +2470,16 @@ out:
return ret;
}
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation (tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success.
+ * Return <0 for errors (ENOMEM or tree search errors).
+ */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
@@ -2333,7 +2494,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
BUG_ON(root_eb == NULL);
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
spin_lock(&fs_info->qgroup_lock);
@@ -2445,62 +2606,64 @@ out:
return ret;
}
+static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+ if (!list_empty(&qgroup->nested_iterator))
+ return;
+
+ list_add_tail(&qgroup->nested_iterator, head);
+}
+
+static void qgroup_iterator_nested_clean(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
+ list_del_init(&qgroup->nested_iterator);
+ }
+}
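
These helpers give each qgroup a second, independent membership field (nested_iterator) so it can sit on a short-lived per-root walk list and on the accumulated per-extent list at the same time; the list_empty() check makes repeated additions idempotent, which is what lets both the old_roots and the new_roots walks feed one accumulated list. A small hedged sketch of that idempotent accumulation, with a plain flag standing in for the embedded list head:

#include <stdio.h>

struct item {
    const char *name;
    int on_outer;  /* marker for the accumulated "nested" list */
};

/* Idempotent add, mirroring the list_empty() check in the helpers above. */
static void outer_add(struct item **set, int *nr, struct item *it)
{
    if (it->on_outer)
        return;
    it->on_outer = 1;
    set[(*nr)++] = it;
}

int main(void)
{
    struct item a = { "qg A" }, b = { "qg B" };
    struct item *outer[4];
    int nr = 0;

    /* Two inner walks (say old_roots then new_roots) both reach qg A... */
    outer_add(outer, &nr, &a);
    outer_add(outer, &nr, &b);
    outer_add(outer, &nr, &a);

    /* ...yet the accumulated list holds each qgroup exactly once. */
    printf("outer list has %d entries (%s, %s)\n", nr,
           outer[0]->name, outer[1]->name);
    return 0;
}
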
+
#define UPDATE_NEW 0
#define UPDATE_OLD 1
/*
* Walk all of the roots that points to the bytenr and adjust their refcnts.
*/
-static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- struct ulist *qgroups, u64 seq, int update_old)
+static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct list_head *qgroups,
+ u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
- int ret = 0;
if (!roots)
- return 0;
+ return;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
+ LIST_HEAD(tmp);
+
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
- ulist_reinit(tmp);
- ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ qgroup_iterator_nested_add(qgroups, qg);
+ qgroup_iterator_add(&tmp, qg);
+ list_for_each_entry(qg, &tmp, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(tmp_unode);
if (update_old)
btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
btrfs_qgroup_update_new_refcnt(qg, seq, 1);
+
list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(qgroups, glist->group->qgroupid,
- qgroup_to_aux(glist->group),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
+ qgroup_iterator_nested_add(qgroups, glist->group);
+ qgroup_iterator_add(&tmp, glist->group);
}
}
+ qgroup_iterator_clean(&tmp);
}
- return 0;
}
/*
@@ -2539,22 +2702,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* But this time we don't need to consider other things, the codes and logic
* is easy to understand now.
*/
-static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
- struct ulist *qgroups,
- u64 nr_old_roots,
- u64 nr_new_roots,
- u64 num_bytes, u64 seq)
+static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct list_head *qgroups, u64 nr_old_roots,
+ u64 nr_new_roots, u64 num_bytes, u64 seq)
{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- u64 cur_new_count, cur_old_count;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(qgroups, &uiter))) {
+ list_for_each_entry(qg, qgroups, nested_iterator) {
+ u64 cur_new_count, cur_old_count;
bool dirty = false;
- qg = unode_aux_to_qgroup(unode);
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
@@ -2625,7 +2782,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
if (dirty)
qgroup_dirty(fs_info, qg);
}
- return 0;
}
/*
@@ -2662,8 +2818,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct ulist *new_roots)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct ulist *qgroups = NULL;
- struct ulist *tmp = NULL;
+ LIST_HEAD(qgroups);
u64 seq;
u64 nr_new_roots = 0;
u64 nr_old_roots = 0;
@@ -2673,7 +2828,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
* If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (!btrfs_qgroup_full_accounting(fs_info) ||
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
goto out_free;
@@ -2697,17 +2852,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
num_bytes, nr_old_roots, nr_new_roots);
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups) {
- ret = -ENOMEM;
- goto out_free;
- }
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp) {
- ret = -ENOMEM;
- goto out_free;
- }
-
mutex_lock(&fs_info->qgroup_rescan_lock);
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
@@ -2722,29 +2866,27 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
seq = fs_info->qgroup_seq;
/* Update old refcnts using old_roots */
- ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
- UPDATE_OLD);
- if (ret < 0)
- goto out;
+ qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD);
/* Update new refcnts using new_roots */
- ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
- UPDATE_NEW);
- if (ret < 0)
- goto out;
+ qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW);
- qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
num_bytes, seq);
/*
+ * We're done using the iterator; release all its qgroups while holding
+ * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
+ * and trigger use-after-free accesses to qgroups.
+ */
+ qgroup_iterator_nested_clean(&qgroups);
+
+ /*
* Bump qgroup_seq to avoid seq overlap
*/
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
-out:
spin_unlock(&fs_info->qgroup_lock);
out_free:
- ulist_free(tmp);
- ulist_free(qgroups);
ulist_free(old_roots);
ulist_free(new_roots);
return ret;
@@ -2761,6 +2903,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
u64 qgroup_to_skip;
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return 0;
+
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
@@ -2876,7 +3021,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
qgroup_mark_inconsistent(fs_info);
spin_lock(&fs_info->qgroup_lock);
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (btrfs_qgroup_enabled(fs_info))
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
else
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -2889,6 +3034,47 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
return ret;
}
+static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
+ u64 inode_rootid,
+ struct btrfs_qgroup_inherit **inherit)
+{
+ int i = 0;
+ u64 num_qgroups = 0;
+ struct btrfs_qgroup *inode_qg;
+ struct btrfs_qgroup_list *qg_list;
+ struct btrfs_qgroup_inherit *res;
+ size_t struct_sz;
+ u64 *qgids;
+
+ if (*inherit)
+ return -EEXIST;
+
+ inode_qg = find_qgroup_rb(fs_info, inode_rootid);
+ if (!inode_qg)
+ return -ENOENT;
+
+ num_qgroups = list_count_nodes(&inode_qg->groups);
+
+ if (!num_qgroups)
+ return 0;
+
+ struct_sz = struct_size(res, qgroups, num_qgroups);
+ if (struct_sz == SIZE_MAX)
+ return -ERANGE;
+
+ res = kzalloc(struct_sz, GFP_NOFS);
+ if (!res)
+ return -ENOMEM;
+ res->num_qgroups = num_qgroups;
+ qgids = res->qgroups;
+
+ list_for_each_entry(qg_list, &inode_qg->groups, next_group)
+ qgids[i++] = qg_list->group->qgroupid;
+
+ *inherit = res;
+ return 0;
+}
+
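
qgroup_auto_inherit() sizes the btrfs_qgroup_inherit as a fixed header plus one trailing array element per parent qgroup via struct_size(), then fills the flexible array in place. A self-contained sketch of the same sizing pattern, using a mock struct rather than the btrfs ioctl layout and a plain overflow check instead of the kernel's struct_size() helper:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct inherit {
    uint64_t num_qgroups;
    uint64_t qgroupids[];  /* flexible array member */
};

static struct inherit *build_inherit(const uint64_t *parents, size_t count)
{
    struct inherit *res;
    size_t sz;

    /* struct_size()-style: header + count trailing elements, overflow checked. */
    if (count > (SIZE_MAX - sizeof(*res)) / sizeof(res->qgroupids[0]))
        return NULL;
    sz = sizeof(*res) + count * sizeof(res->qgroupids[0]);

    res = calloc(1, sz);
    if (!res)
        return NULL;

    res->num_qgroups = count;
    for (size_t i = 0; i < count; i++)
        res->qgroupids[i] = parents[i];
    return res;
}

int main(void)
{
    const uint64_t parents[] = { (1ULL << 48) | 100, (1ULL << 48) | 101 };
    struct inherit *inh = build_inherit(parents, 2);

    if (inh) {
        printf("%llu qgroups, first 0x%llx\n",
               (unsigned long long)inh->num_qgroups,
               (unsigned long long)inh->qgroupids[0]);
        free(inh);
    }
    return 0;
}
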
/*
* Copy the accounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
@@ -2896,7 +3082,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
* when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
- u64 objectid, struct btrfs_qgroup_inherit *inherit)
+ u64 objectid, u64 inode_rootid,
+ struct btrfs_qgroup_inherit *inherit)
{
int ret = 0;
int i;
@@ -2906,10 +3093,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
+ struct btrfs_qgroup *prealloc;
+ struct btrfs_qgroup_list **qlist_prealloc = NULL;
+ bool free_inherit = false;
bool need_rescan = false;
u32 level_size = 0;
u64 nums;
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc)
+ return -ENOMEM;
+
/*
* There are only two callers of this function.
*
@@ -2929,7 +3123,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_enabled(fs_info))
goto out;
quota_root = fs_info->quota_root;
@@ -2938,6 +3132,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto out;
}
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
+ ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit);
+ if (ret)
+ goto out;
+ free_inherit = true;
+ }
+
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
@@ -2982,16 +3183,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto out;
}
ret = 0;
- }
+ qlist_prealloc = kcalloc(inherit->num_qgroups,
+ sizeof(struct btrfs_qgroup_list *),
+ GFP_NOFS);
+ if (!qlist_prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (int i = 0; i < inherit->num_qgroups; i++) {
+ qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
+ GFP_NOFS);
+ if (!qlist_prealloc[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
spin_lock(&fs_info->qgroup_lock);
- dstgroup = add_qgroup_rb(fs_info, objectid);
- if (IS_ERR(dstgroup)) {
- ret = PTR_ERR(dstgroup);
- goto unlock;
- }
+ dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
+ prealloc = NULL;
if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
dstgroup->lim_flags = inherit->lim.flags;
@@ -3003,7 +3216,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
qgroup_dirty(fs_info, dstgroup);
}
- if (srcid) {
+ if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
goto unlock;
@@ -3038,7 +3251,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
if (*i_qgroups) {
- ret = add_relation_rb(fs_info, objectid, *i_qgroups);
+ ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
+ *i_qgroups);
+ qlist_prealloc[i] = NULL;
if (ret)
goto unlock;
}
@@ -3102,6 +3317,14 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
qgroup_mark_inconsistent(fs_info);
+ if (qlist_prealloc) {
+ for (int i = 0; i < inherit->num_qgroups; i++)
+ kfree(qlist_prealloc[i]);
+ kfree(qlist_prealloc);
+ }
+ if (free_inherit)
+ kfree(inherit);
+ kfree(prealloc);
return ret;
}
@@ -3125,8 +3348,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ LIST_HEAD(qgroup_list);
if (!is_fstree(ref_root))
return 0;
@@ -3146,49 +3368,28 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
if (!qgroup)
goto out;
- /*
- * in a first step, we check all affected qgroups if any limits would
- * be exceeded
- */
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
-
- if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(qgroup, num_bytes)) {
ret = -EDQUOT;
goto out;
}
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
+
ret = 0;
/*
* no limits exceeded, now record the reservation into all qgroups
*/
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
-
- qg = unode_aux_to_qgroup(unode);
-
- qgroup_rsv_add(fs_info, qg, num_bytes, type);
- }
+ list_for_each_entry(qgroup, &qgroup_list, iterator)
+ qgroup_rsv_add(fs_info, qgroup, num_bytes, type);
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
return ret;
}
@@ -3207,9 +3408,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_qgroup *qgroup;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
+ LIST_HEAD(qgroup_list);
if (!is_fstree(ref_root))
return;
@@ -3237,30 +3436,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
*/
num_bytes = qgroup->rsv.values[type];
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
-
- qgroup_rsv_release(fs_info, qg, num_bytes, type);
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
+ qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
}
-
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3295,6 +3481,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
int slot;
int ret;
+ if (!btrfs_qgroup_full_accounting(fs_info))
+ return 1;
+
mutex_lock(&fs_info->qgroup_rescan_lock);
extent_root = btrfs_extent_root(fs_info,
fs_info->qgroup_rescan_progress.objectid);
@@ -3375,10 +3564,15 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
- return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
- !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
+ if (btrfs_fs_closing(fs_info))
+ return true;
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ return true;
+ if (!btrfs_qgroup_enabled(fs_info))
+ return true;
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
+ return true;
+ return false;
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3392,6 +3586,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
bool stopped = false;
bool did_leaf_rescans = false;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return;
+
path = btrfs_alloc_path();
if (!path)
goto out;
@@ -3495,6 +3692,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
{
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+ btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
+ return -EINVAL;
+ }
+
if (!init_flags) {
/* we're resuming qgroup rescan at mount time */
if (!(fs_info->qgroup_flags &
@@ -3525,7 +3727,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
- } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
/* Quota disable is in progress */
ret = -EBUSY;
}
@@ -3546,7 +3748,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
mutex_unlock(&fs_info->qgroup_rescan_lock);
btrfs_init_work(&fs_info->qgroup_rescan_work,
- btrfs_qgroup_rescan_worker, NULL, NULL);
+ btrfs_qgroup_rescan_worker, NULL);
return 0;
}
@@ -3784,7 +3986,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
u64 to_reserve;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+ if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid) || len == 0)
return 0;
@@ -3855,13 +4057,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
+ struct extent_changeset *reserved,
+ u64 start, u64 len, u64 *freed_ret)
{
struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
- int freed = 0;
+ u64 freed = 0;
int ret;
extent_changeset_init(&changeset);
@@ -3902,7 +4105,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
}
btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
BTRFS_QGROUP_RSV_DATA);
- ret = freed;
+ if (freed_ret)
+ *freed_ret = freed;
+ ret = 0;
out:
extent_changeset_release(&changeset);
return ret;
@@ -3910,19 +4115,23 @@ out:
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
- int free)
+ u64 *released, int free)
{
struct extent_changeset changeset;
int trace_op = QGROUP_RELEASE;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
- return 0;
+ if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
+ extent_changeset_init(&changeset);
+ return clear_record_extent_bits(&inode->io_tree, start,
+ start + len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+ }
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
- return qgroup_free_reserved_data(inode, reserved, start, len);
+ return qgroup_free_reserved_data(inode, reserved, start, len, released);
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
EXTENT_QGROUP_RESERVED, &changeset);
@@ -3937,7 +4146,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
- ret = changeset.bytes_changed;
+ if (released)
+ *released = changeset.bytes_changed;
out:
extent_changeset_release(&changeset);
return ret;
@@ -3956,9 +4166,10 @@ out:
* NOTE: This function may sleep for memory allocation.
*/
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
+ struct extent_changeset *reserved,
+ u64 start, u64 len, u64 *freed)
{
- return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
+ return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
}
/*
@@ -3976,9 +4187,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
- return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
+ return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
}
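
After this change the released/freed byte count travels through an out pointer (now a u64 rather than a truncating int return), while the return value carries only zero or a negative error, so a large byte count can no longer be mistaken for an errno. A hedged sketch of the resulting calling convention, with invented local names rather than the real btrfs call sites:

#include <stdio.h>
#include <stdint.h>

/*
 * Shape of the new API: status in the return value, byte count via an
 * optional out pointer (callers that don't care may pass NULL).
 */
static int release_range(uint64_t start, uint64_t len, uint64_t *released)
{
    (void)start;

    if (len == 0)
        return -22;            /* -EINVAL */
    if (released)
        *released = len;       /* pretend the whole range was reserved */
    return 0;
}

int main(void)
{
    uint64_t released = 0;
    int ret = release_range(0, 1U << 20, &released);

    if (ret < 0)
        printf("error %d\n", ret);
    else
        printf("released %llu bytes\n", (unsigned long long)released);

    /* A caller that only cares about success can pass NULL. */
    ret = release_range(0, 4096, NULL);
    return ret;
}
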
static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
@@ -4027,7 +4238,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid) || num_bytes == 0)
return 0;
@@ -4064,11 +4275,15 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
+/*
+ * Per-transaction meta reservations should all be freed at transaction commit
+ * time.
+ */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
@@ -4084,7 +4299,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
@@ -4104,9 +4319,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
struct btrfs_qgroup *qgroup;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
+ LIST_HEAD(qgroup_list);
if (num_bytes == 0)
return;
@@ -4117,39 +4330,36 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
qgroup = find_qgroup_rb(fs_info, ref_root);
if (!qgroup)
goto out;
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
- qgroup_rsv_release(fs_info, qg, num_bytes,
+ qgroup_rsv_release(fs_info, qgroup, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
- qgroup_rsv_add(fs_info, qg, num_bytes,
- BTRFS_QGROUP_RSV_META_PERTRANS);
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ if (!sb_rdonly(fs_info->sb))
+ qgroup_rsv_add(fs_info, qgroup, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
}
+/*
+ * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
+ *
+ * This is called when a preallocated meta reservation needs to be used,
+ * normally after a btrfs_join_transaction() call.
+ */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
@@ -4257,7 +4467,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
@@ -4367,7 +4577,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
int ret = 0;
int i;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
return 0;
@@ -4450,3 +4660,61 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
}
*root = RB_ROOT;
}
+
+void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
+{
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+ return;
+
+ if (!is_fstree(root))
+ return;
+
+ btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA);
+}
+
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+ struct btrfs_squota_delta *delta)
+{
+ int ret;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *qg;
+ LIST_HEAD(qgroup_list);
+ u64 root = delta->root;
+ u64 num_bytes = delta->num_bytes;
+ const int sign = (delta->is_inc ? 1 : -1);
+
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+ return 0;
+
+ if (!is_fstree(root))
+ return 0;
+
+ /* If the extent predates enabling quotas, don't count it. */
+ if (delta->generation < fs_info->qgroup_enable_gen)
+ return 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, root);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = 0;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qg, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
+
+ qg->excl += num_bytes * sign;
+ qg->rfer += num_bytes * sign;
+ qgroup_dirty(fs_info, qg);
+
+ list_for_each_entry(glist, &qg->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
+ }
+ qgroup_iterator_clean(&qgroup_list);
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 7bffa10589d6..be18c862e64e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -101,8 +101,15 @@
* subtree rescan for them.
*/
-#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
-#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+/*
+ * These flags share the flags field of the btrfs_qgroup_status_item with the
+ * persisted flags defined in btrfs_tree.h.
+ *
+ * To minimize the chance of collision with new persisted status flags, these
+ * count backwards from the MSB.
+ */
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62)
/*
* Record a dirty extent, and info qgroup to update quota on it
@@ -220,6 +227,33 @@ struct btrfs_qgroup {
struct list_head groups; /* groups this group is member of */
struct list_head members; /* groups that are members of this group */
struct list_head dirty; /* dirty groups */
+
+ /*
+ * For qgroup iteration usage.
+ *
+ * The iteration list should always be empty until qgroup_iterator_add()
+ * is called, and it should be reset to empty after the iteration is
+ * finished.
+ */
+ struct list_head iterator;
+
+ /*
+ * For nested iterator usage.
+ *
+ * Here we support at most one level of nested iterator calls like:
+ *
+ * LIST_HEAD(all_qgroups);
+ * {
+ * LIST_HEAD(local_qgroups);
+ * qgroup_iterator_add(local_qgroups, qg);
+ * qgroup_iterator_nested_add(all_qgroups, qg);
+ * do_some_work(local_qgroups);
+ * qgroup_iterator_clean(local_qgroups);
+ * }
+ * do_some_work(all_qgroups);
+ * qgroup_iterator_nested_clean(all_qgroups);
+ */
+ struct list_head nested_iterator;
struct rb_node node; /* tree of qgroups */
/*
@@ -235,6 +269,19 @@ struct btrfs_qgroup {
struct kobject kobj;
};
+struct btrfs_squota_delta {
+ /* The fstree root this delta counts against. */
+ u64 root;
+ /* The number of bytes in the extent being counted. */
+ u64 num_bytes;
+ /* The generation the extent was created in. */
+ u64 generation;
+ /* Whether we are using or freeing the extent. */
+ bool is_inc;
+ /* Whether the extent is data or metadata. */
+ bool is_data;
+};
+
static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
{
return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
@@ -249,14 +296,23 @@ enum {
ENUM_BIT(QGROUP_FREE),
};
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
+enum btrfs_qgroup_mode {
+ BTRFS_QGROUP_MODE_DISABLED,
+ BTRFS_QGROUP_MODE_FULL,
+ BTRFS_QGROUP_MODE_SIMPLE
+};
+
+enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info);
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_quota_ctl_args *quota_ctl_args);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
bool interruptible);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
- u64 dst);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
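
Most of the qgroup.c churn in this patch replaces direct tests of BTRFS_FS_QUOTA_ENABLED with these three helpers, letting each path state which level of accounting it needs instead of only whether quotas exist. A hedged standalone model of how the predicates appear to relate, based on how the hunks above use them (the actual kernel implementation lives in qgroup.c and is not shown here):

#include <stdio.h>
#include <stdbool.h>

enum qgroup_mode { MODE_DISABLED, MODE_FULL, MODE_SIMPLE };

/* Assumed relationship between the helpers, modelled on their use above. */
static bool qgroup_enabled(enum qgroup_mode m)
{
    return m != MODE_DISABLED;
}

static bool qgroup_full_accounting(enum qgroup_mode m)
{
    return m == MODE_FULL;
}

int main(void)
{
    static const char *const names[] = { "disabled", "full", "simple" };

    /* Simple mode counts as enabled but skips the expensive backref accounting. */
    for (enum qgroup_mode m = MODE_DISABLED; m <= MODE_SIMPLE; m++)
        printf("%-8s enabled=%d full_accounting=%d\n",
               names[m], qgroup_enabled(m), qgroup_full_accounting(m));
    return 0;
}
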
@@ -267,80 +323,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-/*
- * Inform qgroup to trace one dirty extent, its info is recorded in @record.
- * So qgroup can account it at transaction committing time.
- *
- * No lock version, caller must acquire delayed ref lock and allocated memory,
- * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
- *
- * Return 0 for success insert
- * Return >0 for existing record, caller can free @record safely.
- * Error is not possible
- */
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record);
-
-/*
- * Post handler after qgroup_trace_extent_nolock().
- *
- * NOTE: Current qgroup does the expensive backref walk at transaction
- * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
- * new transaction.
- * This is designed to allow btrfs_find_all_roots() to get correct new_roots
- * result.
- *
- * However for old_roots there is no need to do backref walk at that time,
- * since we search commit roots to walk backref and result will always be
- * correct.
- *
- * Due to the nature of no lock version, we can't do backref there.
- * So we must call btrfs_qgroup_trace_extent_post() after exiting
- * spinlock context.
- *
- * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
- * using current root, then we can move all expensive backref walk out of
- * transaction committing, but not now as qgroup accounting will be wrong again.
- */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
-
-/*
- * Inform qgroup to trace one dirty extent, specified by @bytenr and
- * @num_bytes.
- * So qgroup can account it at commit trans time.
- *
- * Better encapsulated version, with memory allocation and backref walk for
- * commit roots.
- * So this can sleep.
- *
- * Return 0 if the operation is done.
- * Return <0 for error, like memory allocation failure or invalid parameter
- * (NULL trans)
- */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes);
-
-/*
- * Inform qgroup to trace all leaf items of data
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM)
- */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb);
-/*
- * Inform qgroup to trace a whole subtree, including all its child tree
- * blocks and data.
- * The root tree block is specified by @root_eb.
- *
- * Normally used by relocation(tree block swap) and subvolume deletion.
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM or tree search error)
- */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
@@ -350,7 +342,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
- u64 objectid, struct btrfs_qgroup_inherit *inherit);
+ u64 objectid, u64 inode_rootid,
+ struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
@@ -363,10 +356,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
/* New io_tree based accurate qgroup reserve API */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released);
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start,
- u64 len);
+ u64 len, u64 *freed);
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
@@ -408,20 +401,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
BTRFS_QGROUP_RSV_META_PREALLOC);
}
-/*
- * Per-transaction meta reservation should be all freed at transaction commit
- * time
- */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
-
-/*
- * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
- *
- * This is called when preallocated meta reservation needs to be used.
- * Normally after btrfs_join_transaction() call.
- */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
@@ -439,5 +420,8 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+ struct btrfs_squota_delta *delta);
#endif
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644
index 000000000000..9589362acfbf
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/btrfs_tree.h>
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+#include "misc.h"
+#include "print-tree.h"
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u64 found_start;
+ u64 found_end;
+ u64 end = start + length;
+ int slot;
+ int ret;
+
+ if (!stripe_root)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ key.objectid = start;
+ key.type = BTRFS_RAID_STRIPE_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ found_start = key.objectid;
+ found_end = found_start + key.offset;
+
+ /* That stripe ends before we start, so we're done. */
+ if (found_end <= start)
+ break;
+
+ trace_btrfs_raid_extent_delete(fs_info, start, end,
+ found_start, found_end);
+
+ ASSERT(found_start >= start && found_end <= end);
+ ret = btrfs_del_item(trans, stripe_root, path);
+ if (ret)
+ break;
+
+ btrfs_release_path(path);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key stripe_key;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
+ u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type);
+ struct btrfs_stripe_extent *stripe_extent;
+ const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
+ int ret;
+
+ stripe_extent = kzalloc(item_size, GFP_NOFS);
+ if (!stripe_extent) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ btrfs_end_transaction(trans);
+ return -ENOMEM;
+ }
+
+ trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
+ num_stripes);
+ btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding);
+ for (int i = 0; i < num_stripes; i++) {
+ u64 devid = bioc->stripes[i].dev->devid;
+ u64 physical = bioc->stripes[i].physical;
+ u64 length = bioc->stripes[i].length;
+ struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
+
+ if (length == 0)
+ length = bioc->size;
+
+ btrfs_set_stack_raid_stride_devid(raid_stride, devid);
+ btrfs_set_stack_raid_stride_physical(raid_stride, physical);
+ }
+
+ stripe_key.objectid = bioc->logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = bioc->size;
+
+ ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
+ item_size);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ kfree(stripe_extent);
+
+ return ret;
+}
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *ordered_extent)
+{
+ struct btrfs_io_context *bioc;
+ int ret;
+
+ if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
+ return 0;
+
+ list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret)
+ return ret;
+ }
+
+ while (!list_empty(&ordered_extent->bioc_list)) {
+ bioc = list_first_entry(&ordered_extent->bioc_list,
+ typeof(*bioc), rst_ordered_entry);
+ list_del(&bioc->rst_ordered_entry);
+ btrfs_put_bioc(bioc);
+ }
+
+ return 0;
+}
+
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index, struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_stripe_extent *stripe_extent;
+ struct btrfs_key stripe_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ const u64 end = logical + *length;
+ int num_stripes;
+ u8 encoding;
+ u64 offset;
+ u64 found_logical;
+ u64 found_length;
+ u64 found_end;
+ int slot;
+ int ret;
+
+ stripe_key.objectid = logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (stripe->is_scrub) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+
+ ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
+ if (ret < 0)
+ goto free_path;
+ if (ret) {
+ if (path->slots[0] != 0)
+ path->slots[0]--;
+ }
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ found_logical = found_key.objectid;
+ found_length = found_key.offset;
+ found_end = found_logical + found_length;
+
+ if (found_logical > end) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (in_range(logical, found_logical, found_length))
+ break;
+
+ ret = btrfs_next_item(stripe_root, path);
+ if (ret)
+ goto out;
+ }
+
+ offset = logical - found_logical;
+
+ /*
+ * If we have a logically contiguous, but physically non-contiguous
+ * range, we need to split the bio. Record the length after which the
+ * split must happen.
+ */
+ if (end > found_end)
+ *length -= end - found_end;
+
+ num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+ stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+ encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
+
+ if (encoding != btrfs_bg_flags_to_raid_index(map_type)) {
+ ret = -EUCLEAN;
+ btrfs_handle_fs_error(fs_info, ret,
+ "on-disk stripe encoding %d doesn't match RAID index %d",
+ encoding,
+ btrfs_bg_flags_to_raid_index(map_type));
+ goto out;
+ }
+
+ for (int i = 0; i < num_stripes; i++) {
+ struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
+ u64 devid = btrfs_raid_stride_devid(leaf, stride);
+ u64 physical = btrfs_raid_stride_physical(leaf, stride);
+
+ if (devid != stripe->dev->devid)
+ continue;
+
+ if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
+ continue;
+
+ stripe->physical = physical + offset;
+
+ trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
+ stripe->physical, devid);
+
+ ret = 0;
+ goto free_path;
+ }
+
+ /* If we're here, we haven't found the requested devid in the stripe. */
+ ret = -ENOENT;
+out:
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret && ret != -EIO && !stripe->is_scrub) {
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ btrfs_print_tree(leaf, 1);
+ btrfs_err(fs_info,
+ "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
+ logical, logical + *length, stripe->dev->devid,
+ btrfs_bg_type_to_raid_name(map_type));
+ }
+free_path:
+ btrfs_free_path(path);
+
+ return ret;
+}
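
The lookup above reduces to interval arithmetic: find the stripe item whose [found_logical, found_end) range covers the requested logical address, translate by the offset into that item, and shrink *length when the request runs past the item's end so the caller knows where to split the bio. A self-contained sketch of just that arithmetic, for a single device and with invented field names:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct stripe_item {
    uint64_t logical;   /* key.objectid in the tree */
    uint64_t length;    /* key.offset               */
    uint64_t physical;  /* per-device stride start  */
};

/* Map [logical, logical + *length) onto one stripe item, clamping *length. */
static bool map_raid_extent(const struct stripe_item *item,
                            uint64_t logical, uint64_t *length,
                            uint64_t *physical)
{
    uint64_t found_end = item->logical + item->length;
    uint64_t end = logical + *length;

    if (logical < item->logical || logical >= found_end)
        return false;           /* not covered by this item */

    if (end > found_end)        /* caller must split the bio here */
        *length -= end - found_end;

    *physical = item->physical + (logical - item->logical);
    return true;
}

int main(void)
{
    struct stripe_item item = { .logical = 1 << 20, .length = 64 << 10,
                                .physical = 8 << 20 };
    uint64_t len = 128 << 10;   /* runs past the end of the item */
    uint64_t phys;

    if (map_raid_extent(&item, (1 << 20) + 4096, &len, &phys))
        printf("physical=%llu mapped_len=%llu\n",
               (unsigned long long)phys, (unsigned long long)len);
    return 0;
}
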
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644
index 000000000000..cdb58b38fcb5
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID1_MASK | \
+ BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID10)
+
+struct btrfs_io_context;
+struct btrfs_io_stripe;
+struct btrfs_ordered_extent;
+struct btrfs_trans_handle;
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length);
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index, struct btrfs_io_stripe *stripe);
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *ordered_extent);
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+ u64 map_type)
+{
+ u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE))
+ return false;
+
+ if (type != BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK)
+ return true;
+
+ return false;
+}
+
+static inline int btrfs_num_raid_stripes(u32 item_size)
+{
+ return (item_size - offsetof(struct btrfs_stripe_extent, strides)) /
+ sizeof(struct btrfs_raid_stride);
+}
+
+#endif
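
btrfs_num_raid_stripes() is the inverse of the struct_size() computation used when the item is inserted: the item is a fixed btrfs_stripe_extent header followed by one btrfs_raid_stride per device, so dividing the remaining item bytes by the stride size recovers the stripe count. A quick standalone check of that round trip, with mock struct layouts chosen for illustration rather than the on-disk format:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Mock layouts: a small header plus a flexible array of per-device strides. */
struct raid_stride { uint64_t devid; uint64_t physical; };
struct stripe_extent {
    uint8_t encoding;
    uint8_t reserved[7];
    struct raid_stride strides[];
};

static size_t item_size(int num_stripes)
{
    return offsetof(struct stripe_extent, strides) +
           (size_t)num_stripes * sizeof(struct raid_stride);
}

static int num_raid_stripes(size_t isize)
{
    return (isize - offsetof(struct stripe_extent, strides)) /
           sizeof(struct raid_stride);
}

int main(void)
{
    for (int n = 1; n <= 4; n++) {
        size_t sz = item_size(n);
        printf("num_stripes=%d -> item_size=%zu -> recovered=%d\n",
               n, sz, num_raid_stripes(sz));
    }
    return 0;
}
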
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 95d28497de7c..6486f0d7e993 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -485,6 +485,9 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
ret = add_shared_data_ref(fs_info, offset, count,
key->objectid, key->offset);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ break;
default:
btrfs_err(fs_info, "invalid key type in iref");
ret = -EINVAL;
@@ -652,7 +655,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
+ * Called when we modify a ref for a bytenr.
*
* This will add an action item to the given bytenr and do sanity checks to make
* sure we haven't messed something up. If we are making a new allocation and
@@ -681,10 +684,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.owning_root;
+ ref_root = generic_ref->tree_ref.ref_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.owning_root;
+ ref_root = generic_ref->data_ref.ref_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
@@ -791,6 +794,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
+ kfree(re);
goto out_unlock;
} else if (be->num_refs == 0) {
btrfs_err(fs_info,
@@ -800,6 +804,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
+ kfree(re);
goto out_unlock;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 65d2bd6910f2..f88b0c2ac3fe 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -25,12 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
const u64 olen,
int no_time_update)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
if (!no_time_update) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}
/*
* We round up to the block size at eof when determining which
@@ -43,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c6d4bb8cbe29..f5d9e5f74a52 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -111,8 +111,8 @@ struct tree_block {
}; /* Use rb_simple_node for search/insert */
u64 owner;
struct btrfs_key key;
- unsigned int level:8;
- unsigned int key_ready:1;
+ u8 level;
+ bool key_ready;
};
#define MAX_EXTENTS 128
@@ -122,6 +122,13 @@ struct file_extent_cluster {
u64 end;
u64 boundary[MAX_EXTENTS];
unsigned int nr;
+ u64 owning_root;
+};
+
+/* Stages of data relocation. */
+enum reloc_stage {
+ MOVE_DATA_EXTENTS,
+ UPDATE_DATA_PTRS
};
struct reloc_control {
@@ -155,16 +162,12 @@ struct reloc_control {
u64 search_start;
u64 extents_found;
- unsigned int stage:8;
- unsigned int create_reloc_tree:1;
- unsigned int merge_reloc_tree:1;
- unsigned int found_file_extent:1;
+ enum reloc_stage stage;
+ bool create_reloc_tree;
+ bool merge_reloc_tree;
+ bool found_file_extent;
};
-/* stages of data relocation */
-#define MOVE_DATA_EXTENTS 0
-#define UPDATE_DATA_PTRS 1
-
static void mark_block_processed(struct reloc_control *rc,
struct btrfs_backref_node *node)
{
@@ -180,13 +183,6 @@ static void mark_block_processed(struct reloc_control *rc,
node->processed = 1;
}
-
-static void mapping_tree_init(struct mapping_tree *tree)
-{
- tree->rb_root = RB_ROOT;
- spin_lock_init(&tree->lock);
-}
-
/*
* walk up backref nodes until reach node presents tree root
*/
@@ -299,7 +295,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
return 1;
}
-static bool reloc_root_is_dead(struct btrfs_root *root)
+static bool reloc_root_is_dead(const struct btrfs_root *root)
{
/*
* Pair with set_bit/clear_bit in clean_dirty_subvols and
@@ -320,7 +316,7 @@ static bool reloc_root_is_dead(struct btrfs_root *root)
* from no reloc root. But btrfs_should_ignore_reloc_root() below is a
* special case.
*/
-static bool have_reloc_root(struct btrfs_root *root)
+static bool have_reloc_root(const struct btrfs_root *root)
{
if (reloc_root_is_dead(root))
return false;
@@ -329,31 +325,30 @@ static bool have_reloc_root(struct btrfs_root *root)
return true;
}
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
+bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- return 0;
+ return false;
/* This root has been merged with its reloc tree, we can ignore it */
if (reloc_root_is_dead(root))
- return 1;
+ return true;
reloc_root = root->reloc_root;
if (!reloc_root)
- return 0;
+ return false;
if (btrfs_header_generation(reloc_root->commit_root) ==
root->fs_info->running_transaction->transid)
- return 0;
+ return false;
/*
- * if there is reloc tree and it was created in previous
- * transaction backref lookup can find the reloc tree,
- * so backref node for the fs tree root is useless for
- * relocation.
+ * If there is reloc tree and it was created in previous transaction
+ * backref lookup can find the reloc tree, so backref node for the fs
+ * tree root is useless for relocation.
*/
- return 1;
+ return true;
}
/*
@@ -547,7 +542,7 @@ out:
*/
static int clone_backref_node(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct btrfs_root *src,
+ const struct btrfs_root *src,
struct btrfs_root *dest)
{
struct btrfs_root *reloc_root = src->reloc_root;
@@ -632,7 +627,7 @@ fail:
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
-static int __must_check __add_reloc_root(struct btrfs_root *root)
+static int __add_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
@@ -1159,7 +1154,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- num_bytes, parent);
+ num_bytes, parent, root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
key.objectid, key.offset,
root->root_key.objectid, false);
@@ -1170,7 +1165,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, parent);
+ num_bytes, parent, root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
key.objectid, key.offset,
root->root_key.objectid, false);
@@ -1181,15 +1176,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
}
}
if (dirty)
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (inode)
btrfs_add_delayed_iput(BTRFS_I(inode));
return ret;
}
-static noinline_for_stack
-int memcmp_node_keys(struct extent_buffer *eb, int slot,
- struct btrfs_path *path, int level)
+static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
+ int slot, const struct btrfs_path *path,
+ int level)
{
struct btrfs_disk_key key1;
struct btrfs_disk_key key2;
@@ -1374,16 +1369,17 @@ again:
*/
btrfs_set_node_blockptr(parent, slot, new_bytenr);
btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
btrfs_set_node_blockptr(path->nodes[level],
path->slots[level], old_bytenr);
btrfs_set_node_ptr_generation(path->nodes[level],
path->slots[level], old_ptr_gen);
- btrfs_mark_buffer_dirty(path->nodes[level]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[level]);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
- blocksize, path->nodes[level]->start);
+ blocksize, path->nodes[level]->start,
+ src->root_key.objectid);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -1392,7 +1388,7 @@ again:
break;
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- blocksize, 0);
+ blocksize, 0, dest->root_key.objectid);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
true);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -1401,8 +1397,9 @@ again:
break;
}
+ /* We don't know the real owning_root, use 0. */
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
- blocksize, path->nodes[level]->start);
+ blocksize, path->nodes[level]->start, 0);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
0, true);
ret = btrfs_free_extent(trans, &ref);
@@ -1411,8 +1408,9 @@ again:
break;
}
+ /* We don't know the real owning_root, use 0. */
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
- blocksize, 0);
+ blocksize, 0, 0);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
0, true);
ret = btrfs_free_extent(trans, &ref);
@@ -1518,8 +1516,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
* [min_key, max_key)
*/
static int invalidate_extent_cache(struct btrfs_root *root,
- struct btrfs_key *min_key,
- struct btrfs_key *max_key)
+ const struct btrfs_key *min_key,
+ const struct btrfs_key *max_key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode = NULL;
@@ -1897,7 +1895,7 @@ again:
}
}
- rc->merge_reloc_tree = 1;
+ rc->merge_reloc_tree = true;
while (!list_empty(&rc->reloc_roots)) {
reloc_root = list_entry(rc->reloc_roots.next,
@@ -2517,11 +2515,12 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start);
btrfs_set_node_ptr_generation(upper->eb, slot,
trans->transid);
- btrfs_mark_buffer_dirty(upper->eb);
+ btrfs_mark_buffer_dirty(trans, upper->eb);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
- upper->eb->start);
+ upper->eb->start,
+ btrfs_header_owner(upper->eb));
btrfs_init_tree_ref(&ref, node->level,
btrfs_header_owner(upper->eb),
root->root_key.objectid, false);
@@ -2633,7 +2632,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
u32 blocksize = rc->extent_root->fs_info->nodesize;
if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+ bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
return 1;
return 0;
}
@@ -2660,7 +2659,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
else
btrfs_node_key_to_cpu(eb, &block->key, 0);
free_extent_buffer(eb);
- block->key_ready = 1;
+ block->key_ready = true;
return 0;
}
@@ -2830,7 +2829,7 @@ out_free_blocks:
static noinline_for_stack int prealloc_file_extent_cluster(
struct btrfs_inode *inode,
- struct file_extent_cluster *cluster)
+ const struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
@@ -2965,7 +2964,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
/*
* Allow error injection to test balance/relocation cancellation
*/
-noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req) ||
atomic_read(&fs_info->reloc_cancel_req) ||
@@ -2973,7 +2972,7 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
-static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
int cluster_nr)
{
/* Last extent, use cluster end directly */
@@ -2985,7 +2984,7 @@ static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
}
static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
- struct file_extent_cluster *cluster,
+ const struct file_extent_cluster *cluster,
int *cluster_nr, unsigned long page_index)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3120,7 +3119,7 @@ release_page:
}
static int relocate_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+ const struct file_extent_cluster *cluster)
{
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
@@ -3158,11 +3157,12 @@ out:
return ret;
}
-static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int relocate_data_extent(struct inode *inode,
+ const struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
{
int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
ret = relocate_file_extent_cluster(inode, cluster);
@@ -3171,8 +3171,38 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
cluster->nr = 0;
}
- if (!cluster->nr)
+ /*
+ * Under simple quotas, we set root->relocation_src_root when we find
+ * the extent. If adjacent extents have different owners, we can't merge
+ * them while relocating. Handle this by storing the owning root that
+ * started a cluster and if we see an extent from a different root break
+ * started a cluster, and if we see an extent from a different root, break
+ *
+ * Without simple quotas, relocation_src_root is always 0, so we should
+ * never see a mismatch, and it should have no effect on relocation
+ * clusters.
+ */
+ if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) {
+ u64 tmp = root->relocation_src_root;
+
+ /*
+ * root->relocation_src_root is the state that actually affects
+ * the preallocation we do here, so set it to the root owning
+ * the cluster we need to relocate.
+ */
+ root->relocation_src_root = cluster->owning_root;
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ /* And reset it back for the current extent's owning root. */
+ root->relocation_src_root = tmp;
+ }
+
+ if (!cluster->nr) {
cluster->start = extent_key->objectid;
+ cluster->owning_root = root->relocation_src_root;
+ }
else
BUG_ON(cluster->nr >= MAX_EXTENTS);
cluster->end = extent_key->objectid + extent_key->offset - 1;
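The hunk above makes relocate_data_extent() flush the current cluster not only when the next extent is non-adjacent, but also when it is owned by a different root (which matters under simple quotas). The following standalone sketch models just that decision; the struct and the must_flush_cluster() helper are simplified stand-ins for illustration, not the btrfs definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the btrfs structures used in the hunk above. */
struct cluster {
    uint64_t start;
    uint64_t end;
    uint64_t owning_root;
    unsigned int nr;
};

/*
 * Return true when the extent starting at @objectid, owned by @src_root,
 * cannot be appended to @c and the current cluster has to be relocated
 * first: either the extent is not adjacent to the cluster, or (under
 * simple quotas) it is owned by a different root than the one that
 * started the cluster.
 */
static bool must_flush_cluster(const struct cluster *c,
                               uint64_t objectid, uint64_t src_root)
{
    if (c->nr == 0)
        return false;           /* empty cluster, just start it */
    if (objectid != c->end + 1)
        return true;            /* non-adjacent extent */
    if (c->owning_root != src_root)
        return true;            /* different owning root */
    return false;
}

int main(void)
{
    struct cluster c = { .start = 100, .end = 199, .owning_root = 5, .nr = 1 };

    printf("%d\n", must_flush_cluster(&c, 200, 5)); /* 0: extend cluster */
    printf("%d\n", must_flush_cluster(&c, 200, 7)); /* 1: owner changed  */
    printf("%d\n", must_flush_cluster(&c, 300, 5)); /* 1: not adjacent   */
    return 0;
}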
@@ -3193,7 +3223,7 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
* the major work is getting the generation and level of the block
*/
static int add_tree_block(struct reloc_control *rc,
- struct btrfs_key *extent_key,
+ const struct btrfs_key *extent_key,
struct btrfs_path *path,
struct rb_root *blocks)
{
@@ -3278,7 +3308,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key.objectid = rc->extent_root->fs_info->nodesize;
block->key.offset = generation;
block->level = level;
- block->key_ready = 0;
+ block->key_ready = false;
block->owner = owner;
rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
@@ -3444,11 +3474,10 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
/*
* helper to find all tree blocks that reference a given data extent
*/
-static noinline_for_stack
-int add_data_references(struct reloc_control *rc,
- struct btrfs_key *extent_key,
- struct btrfs_path *path,
- struct rb_root *blocks)
+static noinline_for_stack int add_data_references(struct reloc_control *rc,
+ const struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
struct ulist_iterator leaf_uiter;
@@ -3622,7 +3651,7 @@ int prepare_to_relocate(struct reloc_control *rc)
if (ret)
return ret;
- rc->create_reloc_tree = 1;
+ rc->create_reloc_tree = true;
set_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
@@ -3702,6 +3731,21 @@ restart:
struct btrfs_extent_item);
flags = btrfs_extent_flags(path->nodes[0], ei);
+ /*
+ * If we are relocating a simple quota owned extent item, we
+ * need to note the owner on the reloc data root so that when
+ * we allocate the replacement item, we can attribute it to the
+ * correct eventual owner (rather than the reloc data root).
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+ struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
+ u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
+ path->nodes[0],
+ path->slots[0]);
+
+ root->relocation_src_root = owning_root_id;
+ }
+
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
} else if (rc->stage == UPDATE_DATA_PTRS &&
@@ -3734,7 +3778,7 @@ restart:
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
- rc->found_file_extent = 1;
+ rc->found_file_extent = true;
ret = relocate_data_extent(rc->data_inode,
&key, &rc->cluster);
if (ret < 0) {
@@ -3771,7 +3815,7 @@ restart:
err = ret;
}
- rc->create_reloc_tree = 0;
+ rc->create_reloc_tree = false;
set_reloc_control(rc);
btrfs_backref_release_cache(&rc->backref_cache);
@@ -3789,7 +3833,7 @@ restart:
merge_reloc_roots(rc);
- rc->merge_reloc_tree = 0;
+ rc->merge_reloc_tree = false;
unset_reloc_control(rc);
btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
@@ -3835,7 +3879,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -3874,9 +3918,9 @@ out:
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
*/
-static noinline_for_stack
-struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *group)
+static noinline_for_stack struct inode *create_reloc_inode(
+ struct btrfs_fs_info *fs_info,
+ const struct btrfs_block_group *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -3971,8 +4015,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->reloc_roots);
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
- btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
- mapping_tree_init(&rc->reloc_root_tree);
+ btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
+ rc->reloc_root_tree.rb_root = RB_ROOT;
+ spin_lock_init(&rc->reloc_root_tree.lock);
extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -4004,7 +4049,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
block_group->start, buf);
}
-static const char *stage_to_string(int stage)
+static const char *stage_to_string(enum reloc_stage stage)
{
if (stage == MOVE_DATA_EXTENTS)
return "move data extents";
@@ -4120,7 +4165,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
WARN_ON(ret && ret != -EAGAIN);
while (1) {
- int finishes_stage;
+ enum reloc_stage finishes_stage;
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
@@ -4303,7 +4348,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
goto out_unset;
}
- rc->merge_reloc_tree = 1;
+ rc->merge_reloc_tree = true;
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
@@ -4422,7 +4467,8 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
}
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
+ struct btrfs_root *root,
+ const struct extent_buffer *buf,
struct extent_buffer *cow)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4561,7 +4607,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
*
* Return U64_MAX if no running relocation.
*/
-u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info)
+u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
u64 logical = U64_MAX;
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 77d69f6ae967..5fb60f2deb53 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -10,15 +10,16 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
+ struct btrfs_root *root,
+ const struct extent_buffer *buf,
struct extent_buffer *cow);
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
-int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info);
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
-u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info);
+bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
+u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 859874579456..603ad1459368 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -51,7 +51,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
}
/*
- * btrfs_find_root - lookup the root by the key.
+ * Lookup the root by the key.
+ *
* root: the root of the root tree
* search_key: the key to search
* path: the path we search
@@ -191,7 +192,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
return ret;
@@ -438,7 +439,7 @@ again:
btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(leaf, name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
@@ -485,7 +486,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
}
/*
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * Reserve space for subvolume operation.
+ *
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need do reservation
@@ -508,7 +510,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ if (btrfs_qgroup_enabled(fs_info)) {
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index cbbaca32126e..8b2c3859e464 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_ROOT_TREE_H
#define BTRFS_ROOT_TREE_H
+struct fscrypt_str;
+
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
@@ -18,10 +20,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_root_item *item);
-int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_key *key,
- struct btrfs_root_item *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_root_item *item);
int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
struct btrfs_path *path, struct btrfs_root_item *root_item,
struct btrfs_key *root_key);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b877203f1dc5..f62a408671cb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -16,7 +16,6 @@
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
-#include "check-integrity.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
@@ -24,6 +23,7 @@
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
+#include "raid-stripe-tree.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -897,7 +897,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
ASSERT(stripe->mirror_num >= 1);
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
stripe->logical, &mapped_len, &bioc,
- NULL, NULL, 1);
+ NULL, NULL);
/*
* If we failed, dev will be NULL, and later detailed reports
* will just be skipped.
@@ -1635,6 +1635,71 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
}
}
+static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
+ struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ u64 stripe_len = BTRFS_STRIPE_LEN;
+ int mirror = stripe->mirror_num;
+ int i;
+
+ atomic_inc(&stripe->pending_io);
+
+ for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+ struct page *page = scrub_stripe_get_page(stripe, i);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+
+ /* The current sector cannot be merged, submit the bio. */
+ if (bbio &&
+ ((i > 0 &&
+ !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
+ bbio->bio.bi_iter.bi_size >= stripe_len)) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bio(bbio, mirror);
+ bbio = NULL;
+ }
+
+ if (!bbio) {
+ struct btrfs_io_stripe io_stripe = {};
+ struct btrfs_io_context *bioc = NULL;
+ const u64 logical = stripe->logical +
+ (i << fs_info->sectorsize_bits);
+ int err;
+
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+
+ io_stripe.is_scrub = true;
+ err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &stripe_len, &bioc, &io_stripe,
+ &mirror);
+ btrfs_put_bioc(bioc);
+ if (err) {
+ btrfs_bio_end_io(bbio,
+ errno_to_blk_status(err));
+ return;
+ }
+ }
+
+ __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ }
+
+ if (bbio) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bio(bbio, mirror);
+ }
+
+ if (atomic_dec_and_test(&stripe->pending_io)) {
+ wake_up(&stripe->io_wait);
+ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+ queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
+ }
+}
+
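scrub_submit_extent_sector_read() above walks the extent sector bitmap and batches contiguous sectors into a single bio, submitting whenever it hits a gap in the bitmap or the per-stripe size limit. Below is a minimal userspace model of that batching loop, assuming a toy 16-bit bitmap and a made-up submit_batch() placeholder in place of the btrfs bio machinery.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_SECTORS  16
#define MAX_BATCH   4   /* stand-in for the BTRFS_STRIPE_LEN size cap */

/* Placeholder for the bio submission: here we just report the range. */
static void submit_batch(unsigned int first, unsigned int count)
{
    printf("submit sectors [%u, %u)\n", first, first + count);
}

/*
 * Batch the set bits of @bitmap into contiguous runs, flushing a run when
 * a cleared bit (gap) is found or the run reaches MAX_BATCH sectors.
 */
static void submit_extent_sectors(uint32_t bitmap)
{
    unsigned int first = 0, count = 0;

    for (unsigned int i = 0; i < NR_SECTORS; i++) {
        bool set = bitmap & (1u << i);

        if (count && (!set || count >= MAX_BATCH)) {
            submit_batch(first, count);
            count = 0;
        }
        if (set) {
            if (!count)
                first = i;
            count++;
        }
    }
    if (count)
        submit_batch(first, count);
}

int main(void)
{
    submit_extent_sectors(0x0ff5);  /* sectors 0, 2 and 4-11 */
    return 0;
}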
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
struct scrub_stripe *stripe)
{
@@ -1646,6 +1711,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
ASSERT(stripe->mirror_num > 0);
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+ if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
+ scrub_submit_extent_sector_read(sctx, stripe);
+ return;
+ }
+
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
@@ -1798,6 +1868,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
*/
ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
+ /* @found_logical_ret must be specified. */
+ ASSERT(found_logical_ret);
+
stripe = &sctx->stripes[sctx->cur_stripe];
scrub_reset_stripe(stripe);
ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
@@ -1806,8 +1879,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
/* Either >0 as no more extents or <0 for error. */
if (ret)
return ret;
- if (found_logical_ret)
- *found_logical_ret = stripe->logical;
+ *found_logical_ret = stripe->logical;
sctx->cur_stripe++;
/* We filled one group, submit it. */
@@ -1952,7 +2024,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
- &length, &bioc, NULL, NULL, 1);
+ &length, &bioc, NULL, NULL);
if (ret < 0) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
@@ -2010,7 +2082,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
- u64 found_logical;
+ u64 found_logical = U64_MAX;
u64 cur_physical = physical + cur_logical - logical_start;
/* Canceled? */
@@ -2045,6 +2117,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
if (ret < 0)
break;
+ /* queue_scrub_stripe() returned 0, @found_logical must be updated. */
+ ASSERT(found_logical != U64_MAX);
cur_logical = found_logical + BTRFS_STRIPE_LEN;
/* Don't hold CPU for too long time */
@@ -2717,7 +2791,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
else
- gen = fs_info->last_trans_committed;
+ gen = btrfs_get_last_trans_committed(fs_info);
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3a566150c531..4e36550618e5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -796,7 +796,7 @@ static int send_cmd(struct send_ctx *sctx)
put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
put_unaligned_le32(0, &hdr->crc);
- crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -5669,8 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
hdr->crc = 0;
- crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
- crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ crc = crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -8158,7 +8158,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
}
sctx->send_filp = fget(arg->send_fd);
- if (!sctx->send_filp) {
+ if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out;
}
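The send.c hunks switch the btrfs_crc32c() wrapper to plain crc32c() while keeping the usual ordering: zero the crc field in the command header, checksum the buffer, then store the result. The sketch below shows that ordering with a self-contained bitwise CRC-32C; the header layout here is a toy one and the in-kernel crc32c() seeding conventions may differ, so treat this as an illustration of the pattern rather than of the send stream format.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy command layout: a small header followed by an opaque payload. */
struct cmd_header {
    uint32_t len;
    uint32_t crc;
};

union cmd_buf {
    struct cmd_header hdr;
    uint8_t bytes[64];
};

/* Bitwise CRC-32C (Castagnoli), reflected polynomial 0x82F63B78. */
static uint32_t crc32c_sw(uint32_t crc, const uint8_t *p, size_t len)
{
    crc = ~crc;
    while (len--) {
        crc ^= *p++;
        for (int i = 0; i < 8; i++)
            crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
    }
    return ~crc;
}

/*
 * Zero the crc field so it does not contribute to the checksum, sum the
 * whole buffer, then store the result back into the header -- the same
 * order of operations as in the send_cmd() hunk above.
 */
static void seal_command(union cmd_buf *cmd, size_t size)
{
    cmd->hdr.crc = 0;
    cmd->hdr.crc = crc32c_sw(0, cmd->bytes, size);
}

int main(void)
{
    union cmd_buf cmd = { .hdr = { .len = 7, .crc = 0 } };

    memcpy(cmd.bytes + sizeof(cmd.hdr), "payload", 7);
    seal_command(&cmd, sizeof(cmd.hdr) + 7);
    printf("crc = 0x%08x\n", cmd.hdr.crc);
    return 0;
}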
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d7e8cd4f140c..571bb13587d5 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -345,8 +345,10 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_space_info *data_sinfo;
u64 profile;
u64 avail;
+ u64 data_chunk_size;
int factor;
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
@@ -364,6 +366,36 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
*/
factor = btrfs_bg_type_to_factor(profile);
avail = div_u64(avail, factor);
+ if (avail == 0)
+ return 0;
+
+ /*
+ * Calculate the data_chunk_size: space_info->chunk_size is the
+ * "optimal" chunk size based on the fs size. However when we actually
+ * allocate the chunk we will strip this down further, making it no more
+ * than 10% of the disk or 1G, whichever is smaller.
+ */
+ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+
+ /*
+ * Since data allocations immediately use block groups as part of the
+ * reservation (because we assume that data reservations will == actual
+ * usage), we could potentially overcommit and then immediately have that
+ * available space used by a data allocation, which could put us in a
+ * bind when we get close to filling the file system.
+ *
+ * To handle this, simply remove the data_chunk_size from the available
+ * space. If we are relatively empty this won't affect our ability to
+ * overcommit much, and if we're very close to full it'll keep us from
+ * getting into a position where we've given ourselves very little
+ * metadata wiggle room.
+ */
+ if (avail <= data_chunk_size)
+ return 0;
+ avail -= data_chunk_size;
/*
* If we aren't flushing all things, let us overcommit up to
@@ -556,18 +588,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
return nr;
}
-static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
- u64 to_reclaim)
-{
- const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
- u64 nr;
-
- nr = div64_u64(to_reclaim, bytes);
- if (!nr)
- nr = 1;
- return nr;
-}
-
#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
@@ -749,10 +769,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
- nr = calc_delayed_refs_nr(fs_info, num_bytes);
+ btrfs_run_delayed_refs(trans, num_bytes);
else
- nr = 0;
- btrfs_run_delayed_refs(trans, nr);
+ btrfs_run_delayed_refs(trans, 0);
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
@@ -978,7 +997,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
}
/*
- * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * We've exhausted our flushing, start failing tickets.
+ *
* @fs_info - fs_info for this fs
* @space_info - the space info we were flushing
*
@@ -1742,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* Try to reserve metadata bytes from the block_rsv's space.
*
* @fs_info: the filesystem
- * @block_rsv: block_rsv we're allocating for
+ * @space_info: the space_info we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
*
@@ -1754,21 +1774,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* space already.
*/
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
- ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- block_rsv->space_info->flags,
- orig_bytes, 1);
+ space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, block_rsv->space_info,
- orig_bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
}
return ret;
}
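The calc_available_free_space() change above stops overcommitting the space that the next data chunk allocation is expected to consume: the data chunk size is capped at the smaller of the space_info chunk size, 10% of the writable bytes and 1 GiB, and then subtracted from the available space. A standalone sketch of that arithmetic follows; the helper name and the sample numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define SZ_1G   (1024ULL * 1024 * 1024)

static uint64_t min_u64(uint64_t a, uint64_t b)
{
    return a < b ? a : b;
}

/*
 * Mirror the adjustment added above: cap the expected data chunk size at
 * 10% of the writable device bytes and at 1 GiB, then remove it from the
 * space we are willing to overcommit against.
 */
static uint64_t avail_for_overcommit(uint64_t avail, uint64_t data_chunk_size,
                                     uint64_t total_rw_bytes)
{
    uint64_t cap;

    if (avail == 0)
        return 0;

    cap = min_u64(data_chunk_size, total_rw_bytes / 10);
    cap = min_u64(cap, SZ_1G);

    return avail <= cap ? 0 : avail - cap;
}

int main(void)
{
    /* 100 GiB device, 4 GiB unallocated, 10 GiB "optimal" chunk size. */
    uint64_t avail = 4 * SZ_1G;
    uint64_t total = 100 * SZ_1G;

    printf("%llu bytes usable for overcommit\n",
           (unsigned long long)avail_for_overcommit(avail, 10 * SZ_1G, total));
    return 0;
}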
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 0bb9d14e60a8..92c595fed1b0 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
+#include <trace/events/btrfs.h>
#include "volumes.h"
/*
@@ -212,7 +213,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a093ec0f7e3..ef256b944c72 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
+#include <linux/security.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -79,7 +80,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data);
static void btrfs_put_super(struct super_block *sb)
{
- close_ctree(btrfs_sb(sb));
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid);
+ close_ctree(fs_info);
}
enum {
@@ -129,9 +133,6 @@ enum {
Opt_inode_cache, Opt_noinode_cache,
/* Debugging options */
- Opt_check_integrity,
- Opt_check_integrity_including_extent_data,
- Opt_check_integrity_print_mask,
Opt_enospc_debug, Opt_noenospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
@@ -200,9 +201,6 @@ static const match_table_t tokens = {
{Opt_recovery, "recovery"},
/* Debugging options */
- {Opt_check_integrity, "check_int"},
- {Opt_check_integrity_including_extent_data, "check_int_data"},
- {Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
{Opt_enospc_debug, "enospc_debug"},
{Opt_noenospc_debug, "noenospc_debug"},
#ifdef CONFIG_BTRFS_DEBUG
@@ -707,44 +705,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_skip_balance:
btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
break;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- case Opt_check_integrity_including_extent_data:
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info,
- "enabling check integrity including extent data");
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
- break;
- case Opt_check_integrity:
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info, "enabling check integrity");
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
- break;
- case Opt_check_integrity_print_mask:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info,
- "unrecognized check_integrity_print_mask value %s",
- args[0].from);
- goto out;
- }
- info->check_integrity_print_mask = intarg;
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info, "check_integrity_print_mask 0x%x",
- info->check_integrity_print_mask);
- break;
-#else
- case Opt_check_integrity_including_extent_data:
- case Opt_check_integrity:
- case Opt_check_integrity_print_mask:
- btrfs_err(info,
- "support for check_integrity* not compiled in!");
- ret = -EINVAL;
- goto out;
-#endif
case Opt_fatal_errors:
if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
@@ -889,7 +849,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
error = -ENOMEM;
goto out;
}
- device = btrfs_scan_one_device(device_name, flags);
+ device = btrfs_scan_one_device(device_name, flags, false);
kfree(device_name);
if (IS_ERR(device)) {
error = PTR_ERR(device);
@@ -1305,15 +1265,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
- seq_puts(seq, ",check_int_data");
- else if (btrfs_test_opt(info, CHECK_INTEGRITY))
- seq_puts(seq, ",check_int");
- if (info->check_integrity_print_mask)
- seq_printf(seq, ",check_int_print_mask=%d",
- info->check_integrity_print_mask);
-#endif
if (info->metadata_ratio)
seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
@@ -1484,7 +1435,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
goto error_fs_info;
}
- device = btrfs_scan_one_device(device_name, mode);
+ /*
+ * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+ * either a valid device or an error.
+ */
+ device = btrfs_scan_one_device(device_name, mode, true);
+ ASSERT(device != NULL);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
error = PTR_ERR(device);
@@ -1519,7 +1475,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+ shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data);
@@ -2196,7 +2152,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
+ /*
+ * Scanning outside of mount can return NULL which would turn
+ * into 0 error code.
+ */
+ device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
@@ -2210,8 +2170,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
- if (IS_ERR(device)) {
+ /*
+ * Scanning outside of mount can return NULL which would turn
+ * into 0 error code.
+ */
+ device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
ret = PTR_ERR(device);
break;
@@ -2256,6 +2220,7 @@ static int check_dev_super(struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_super_block *sb;
+ u64 last_trans;
u16 csum_type;
int ret = 0;
@@ -2291,10 +2256,10 @@ static int check_dev_super(struct btrfs_device *dev)
if (ret < 0)
goto out;
- if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ last_trans = btrfs_get_last_trans_committed(fs_info);
+ if (btrfs_super_generation(sb) != last_trans) {
btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
- btrfs_super_generation(sb),
- fs_info->last_trans_committed);
+ btrfs_super_generation(sb), last_trans);
ret = -EUCLEAN;
goto out;
}
@@ -2404,9 +2369,6 @@ static int __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- ", integrity-checker=on"
-#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
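btrfs_scan_one_device() can now legitimately return NULL when called outside of mount, and the comments above warn that a plain PTR_ERR()-style conversion would silently turn that NULL into 0. The userspace sketch below only demonstrates how the generic ERR_PTR helpers behave for the three possible outcomes; these reimplementations are simplified stand-ins for linux/err.h, not the btrfs code.

#include <errno.h>
#include <stdio.h>

/* Simplified userspace stand-ins for the linux/err.h helpers. */
#define MAX_ERRNO   4095

static void *ERR_PTR(long error)            { return (void *)error; }
static long PTR_ERR(const void *ptr)        { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static int IS_ERR_OR_NULL(const void *ptr)  { return !ptr || IS_ERR(ptr); }
static long PTR_ERR_OR_ZERO(const void *ptr){ return IS_ERR(ptr) ? PTR_ERR(ptr) : 0; }

int main(void)
{
    void *ok = &ok;                 /* some valid pointer */
    void *err = ERR_PTR(-EINVAL);   /* scan failed */
    void *none = NULL;              /* nothing found (outside of mount) */

    /* PTR_ERR_OR_ZERO() maps both a valid pointer and NULL to 0 ... */
    printf("ok:   %ld\n", PTR_ERR_OR_ZERO(ok));     /* 0 */
    printf("err:  %ld\n", PTR_ERR_OR_ZERO(err));    /* -22 */
    printf("none: %ld\n", PTR_ERR_OR_ZERO(none));   /* 0, not an error */

    /* ... so a caller that must treat "no device" specially checks both. */
    printf("none is err-or-null: %d\n", IS_ERR_OR_NULL(none));  /* 1 */
    return 0;
}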
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b1d1ac25237b..e6b51fb3ddc1 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
#ifdef CONFIG_BTRFS_DEBUG
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
+/* Remove once support for raid stripe tree is feature complete. */
+BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
BTRFS_FEAT_ATTR_PTR(block_group_tree),
+ BTRFS_FEAT_ATTR_PTR(simple_quota),
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
+ BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
@@ -420,6 +425,13 @@ static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *bu
}
BTRFS_ATTR(static_feature, acl, acl_show);
+static ssize_t temp_fsid_supported_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ return sysfs_emit(buf, "0\n");
+}
+BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show);
+
/*
* Features which only depend on kernel version.
*
@@ -433,6 +445,7 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, send_stream_version),
BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
+ BTRFS_ATTR_PTR(static_feature, temp_fsid),
NULL
};
@@ -1196,10 +1209,19 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info));
}
BTRFS_ATTR(, generation, btrfs_generation_show);
+static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid);
+}
+BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
+
static const char * const btrfs_read_policy_name[] = { "pid" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
@@ -1302,6 +1324,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, read_policy),
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
+ BTRFS_ATTR_PTR(, temp_fsid),
NULL,
};
@@ -2086,6 +2109,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
}
BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ ssize_t ret = 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ ASSERT(btrfs_qgroup_enabled(fs_info));
+ switch (btrfs_qgroup_mode(fs_info)) {
+ case BTRFS_QGROUP_MODE_FULL:
+ ret = sysfs_emit(buf, "qgroup\n");
+ break;
+ case BTRFS_QGROUP_MODE_SIMPLE:
+ ret = sysfs_emit(buf, "squota\n");
+ break;
+ default:
+ btrfs_warn(fs_info, "unexpected qgroup mode %d\n",
+ btrfs_qgroup_mode(fs_info));
+ break;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return ret;
+}
+BTRFS_ATTR(qgroups, mode, qgroup_mode_show);
+
static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
struct kobj_attribute *a,
char *buf)
@@ -2148,6 +2198,7 @@ static struct attribute *qgroups_attrs[] = {
BTRFS_ATTR_PTR(qgroups, enabled),
BTRFS_ATTR_PTR(qgroups, inconsistent),
BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ BTRFS_ATTR_PTR(qgroups, mode),
NULL
};
ATTRIBUTE_GROUPS(qgroups);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 5ef0b90e25c3..6a43a64ba55a 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -61,7 +61,11 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- btrfs_setup_item_for_insert(root, path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, path, &key, value_len);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 05b03f5eab83..492d69d2fa73 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -34,7 +34,11 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- btrfs_setup_item_for_insert(root, &path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,7 +68,11 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- btrfs_setup_item_for_insert(root, &path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c780d3729463..5b3333ceef04 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -37,8 +37,6 @@
static struct kmem_cache *btrfs_trans_handle_cachep;
-#define BTRFS_ROOT_TRANS_TAG 0
-
/*
* Transaction states and transitions
*
@@ -386,7 +384,7 @@ loop:
IO_TREE_TRANS_DIRTY_PAGES);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS);
- fs_info->generation++;
+ btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
@@ -561,6 +559,69 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
return true;
}
+static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush,
+ u64 num_bytes,
+ u64 *delayed_refs_bytes)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
+ u64 extra_delayed_refs_bytes = 0;
+ u64 bytes;
+ int ret;
+
+ /*
+ * If there's a gap between the size of the delayed refs reserve and
+ * its reserved space, then some tasks have added delayed refs or bumped
+ * its size otherwise (due to block group creation or removal, or block
+ * group item update). Also try to allocate that gap in order to prevent
+ * using (and possibly abusing) the global reserve when committing the
+ * transaction.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
+ spin_lock(&delayed_refs_rsv->lock);
+ if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
+ extra_delayed_refs_bytes = delayed_refs_rsv->size -
+ delayed_refs_rsv->reserved;
+ spin_unlock(&delayed_refs_rsv->lock);
+ }
+
+ bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
+
+ /*
+ * We want to reserve all the bytes we may need all at once, so we only
+ * do 1 enospc flushing cycle per transaction start.
+ */
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ if (ret == 0) {
+ if (extra_delayed_refs_bytes > 0)
+ btrfs_migrate_to_delayed_refs_rsv(fs_info,
+ extra_delayed_refs_bytes);
+ return 0;
+ }
+
+ if (extra_delayed_refs_bytes > 0) {
+ bytes -= extra_delayed_refs_bytes;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ if (ret == 0)
+ return 0;
+ }
+
+ /*
+ * If we are an emergency flush, which can steal from the global block
+ * reserve, then attempt to not reserve space for the delayed refs, as
+ * we will consume space for them from the global block reserve.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ bytes -= *delayed_refs_bytes;
+ *delayed_refs_bytes = 0;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ }
+
+ return ret;
+}
+
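btrfs_reserve_trans_metadata() above tries to reserve the transaction bytes, the delayed-ref bytes and any gap in the delayed refs reserve in one flushing cycle, then progressively drops the optional parts when the reservation fails. The sketch below models only that fallback ladder; the reserve() stub, the pool size and the flush-mode names are simplified placeholders, not the btrfs API.

#include <stdint.h>
#include <stdio.h>

enum flush_mode { FLUSH_ALL, FLUSH_ALL_STEAL };

/* Stub: pretend the space pool only has @pool bytes left. */
static uint64_t pool = 96;

static int reserve(uint64_t bytes)
{
    if (bytes > pool)
        return -1;      /* ENOSPC stand-in */
    pool -= bytes;
    return 0;
}

/*
 * Fallback ladder modeled on the hunk above:
 *   1) ask for everything (trans bytes + delayed refs + reserve gap),
 *   2) drop the optional gap top-up and retry,
 *   3) if we may steal from the global reserve, drop the delayed-ref
 *      bytes too and let the global reserve cover them later.
 */
static int reserve_trans_metadata(enum flush_mode flush, uint64_t num_bytes,
                                  uint64_t *delayed_refs_bytes, uint64_t gap)
{
    uint64_t extra = (flush == FLUSH_ALL) ? gap : 0;

    if (reserve(num_bytes + *delayed_refs_bytes + extra) == 0)
        return 0;

    if (extra && reserve(num_bytes + *delayed_refs_bytes) == 0)
        return 0;

    if (flush == FLUSH_ALL_STEAL) {
        *delayed_refs_bytes = 0;
        return reserve(num_bytes);
    }
    return -1;
}

int main(void)
{
    uint64_t delayed = 32;

    if (reserve_trans_metadata(FLUSH_ALL, 64, &delayed, 16) == 0)
        printf("reserved, delayed refs bytes = %llu\n",
               (unsigned long long)delayed);
    return 0;
}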
static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
unsigned int type, enum btrfs_reserve_flush_enum flush,
@@ -568,10 +629,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
u64 qgroup_reserved = 0;
+ u64 delayed_refs_bytes = 0;
bool reloc_reserved = false;
bool do_chunk_alloc = false;
int ret;
@@ -594,9 +657,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* the appropriate flushing if need be.
*/
if (num_items && root != fs_info->chunk_root) {
- struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
- u64 delayed_refs_bytes = 0;
-
qgroup_reserved = num_items * fs_info->nodesize;
/*
* Use prealloc for now, as there might be a currently running
@@ -608,20 +668,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (ret)
return ERR_PTR(ret);
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
/*
- * We want to reserve all the bytes we may need all at once, so
- * we only do 1 enospc flushing cycle per transaction start. We
- * accomplish this by simply assuming we'll do num_items worth
- * of delayed refs updates in this trans handle, and refill that
- * amount for whatever is missing in the reserve.
+ * If we plan to insert/update/delete "num_items" from a btree,
+ * we will also generate delayed refs for extent buffers in the
+ * respective btree paths, so reserve space for the delayed refs
+ * that will be generated by the caller as it modifies btrees.
+ * Try to reserve them to avoid excessive use of the global
+ * block reserve.
*/
- num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- num_items);
- num_bytes += delayed_refs_bytes;
- }
+ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);
/*
* Do the reservation for the relocation root creation
@@ -631,16 +687,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush);
+ ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
+ &delayed_refs_bytes);
if (ret)
goto reserve_fail;
- if (delayed_refs_bytes) {
- btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes);
- num_bytes -= delayed_refs_bytes;
- }
- btrfs_block_rsv_add_bytes(rsv, num_bytes, true);
- if (rsv->space_info->force_alloc)
+ btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);
+
+ if (trans_rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
!btrfs_block_rsv_full(delayed_refs_rsv)) {
@@ -700,6 +754,7 @@ again:
h->type = type;
INIT_LIST_HEAD(&h->new_bgs);
+ btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);
smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
@@ -712,8 +767,17 @@ again:
if (num_bytes) {
trace_btrfs_space_reservation(fs_info, "transaction",
h->transid, num_bytes, 1);
- h->block_rsv = &fs_info->trans_block_rsv;
+ h->block_rsv = trans_rsv;
h->bytes_reserved = num_bytes;
+ if (delayed_refs_bytes > 0) {
+ trace_btrfs_space_reservation(fs_info,
+ "local_delayed_refs_rsv",
+ h->transid,
+ delayed_refs_bytes, 1);
+ h->delayed_refs_bytes_reserved = delayed_refs_bytes;
+ btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
+ delayed_refs_bytes = 0;
+ }
h->reloc_reserved = reloc_reserved;
}
@@ -769,8 +833,10 @@ join_fail:
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
if (num_bytes)
- btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
- num_bytes, NULL);
+ btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
+ if (delayed_refs_bytes)
+ btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
+ delayed_refs_bytes);
reserve_fail:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -817,7 +883,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo
}
/*
- * btrfs_attach_transaction() - catch the running transaction
+ * Catch the running transaction.
*
* It is used when we want to commit the current the transaction, but
* don't want to start a new one.
@@ -836,7 +902,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
}
/*
- * btrfs_attach_transaction_barrier() - catch the running transaction
+ * Catch the running transaction.
*
* It is similar to the above function, the difference is this one
* will wait for all the inactive transactions until they fully
@@ -912,7 +978,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
int ret = 0;
if (transid) {
- if (transid <= fs_info->last_trans_committed)
+ if (transid <= btrfs_get_last_trans_committed(fs_info))
goto out;
/* find specified transaction */
@@ -936,7 +1002,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
* raced with btrfs_commit_transaction
*/
if (!cur_trans) {
- if (transid > fs_info->last_trans_committed)
+ if (transid > btrfs_get_last_trans_committed(fs_info))
ret = -EINVAL;
goto out;
}
@@ -991,11 +1057,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
if (!trans->block_rsv) {
ASSERT(!trans->bytes_reserved);
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
}
- if (!trans->bytes_reserved)
+ if (!trans->bytes_reserved) {
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
+ }
ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
trace_btrfs_space_reservation(fs_info, "transaction",
@@ -1003,6 +1072,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
+
+ if (!trans->delayed_refs_bytes_reserved)
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
+ trans->transid,
+ trans->delayed_refs_bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
+ trans->delayed_refs_bytes_reserved, NULL);
+ trans->delayed_refs_bytes_reserved = 0;
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -1334,7 +1413,7 @@ again:
}
/* Now flush any delayed refs generated by updating all of the roots */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
@@ -1349,7 +1428,7 @@ again:
* so we want to keep this flushing in this loop to make sure
* everything gets run.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
}
@@ -1484,45 +1563,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
}
/*
- * defrag a given btree.
- * Every leaf in the btree is read and defragged.
- */
-int btrfs_defrag_root(struct btrfs_root *root)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_trans_handle *trans;
- int ret;
-
- if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
- return 0;
-
- while (1) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
-
- ret = btrfs_defrag_leaves(trans, root);
-
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(info);
- cond_resched();
-
- if (btrfs_fs_closing(info) || ret != -EAGAIN)
- break;
-
- if (btrfs_defrag_cancelled(info)) {
- btrfs_debug(info, "defrag_root cancelled");
- ret = -EAGAIN;
- break;
- }
- }
- clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
- return ret;
-}
-
-/*
* Do all special snapshot related qgroup dirty hack.
*
* Will do all needed qgroup inherit and dirty hack like switch commit
@@ -1539,11 +1579,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
int ret;
/*
- * Save some performance in the case that qgroups are not
- * enabled. If this check races with the ioctl, rescan will
- * kick in anyway.
+ * Save some performance in the case that qgroups are not enabled. If
+ * this check races with the ioctl, rescan will kick in anyway.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/*
@@ -1567,7 +1606,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* for now flush the delayed refs to narrow the race window where the
* qgroup counters could end up wrong.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -1582,7 +1621,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
/* Now qgroup are all updated, we can inherit it to new qgroups */
ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
- inherit);
+ parent->root_key.objectid, inherit);
if (ret < 0)
goto out;
@@ -1732,6 +1771,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
+ ret = btrfs_create_qgroup(trans, objectid);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
/*
* pull in the delayed directory update
* and the delayed inode item
@@ -1843,8 +1888,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* To co-operate with that hack, we do hack again.
* Or snapshot will be greatly slowed down by a subtree qgroup rescan
*/
- ret = qgroup_account_snapshot(trans, root, parent_root,
- pending->inherit, objectid);
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
+ ret = qgroup_account_snapshot(trans, root, parent_root,
+ pending->inherit, objectid);
+ else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid,
+ parent_root->root_key.objectid, pending->inherit);
if (ret < 0)
goto fail;
@@ -1860,8 +1909,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
fname.disk_name.len * 2);
- parent_inode->i_mtime = inode_set_ctime_current(parent_inode);
- ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
+ inode_set_mtime_to_ts(parent_inode,
+ inode_set_ctime_current(parent_inode));
+ ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -2084,7 +2134,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
}
}
@@ -2403,7 +2453,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
goto unlock_reloc;
@@ -2536,7 +2586,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
- fs_info->last_trans_committed = cur_trans->transid;
+ btrfs_set_last_trans_committed(fs_info, cur_trans->transid);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2654,18 +2704,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
*/
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit)
+ unsigned int line, int error, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- WRITE_ONCE(trans->aborted, errno);
- WRITE_ONCE(trans->transaction->aborted, errno);
- if (first_hit && errno == -ENOSPC)
+ WRITE_ONCE(trans->aborted, error);
+ WRITE_ONCE(trans->transaction->aborted, error);
+ if (first_hit && error == -ENOSPC)
btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+ __btrfs_handle_fs_error(fs_info, function, line, error, NULL);
}
int __init btrfs_transaction_init(void)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93869cda6af9..2bf8bbdfd0b3 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -12,6 +12,9 @@
#include "ctree.h"
#include "misc.h"
+/* Radix-tree tag for roots that are part of the transaction. */
+#define BTRFS_ROOT_TRANS_TAG 0
+
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
TRANS_STATE_COMMIT_PREP,
@@ -118,8 +121,10 @@ enum {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 delayed_refs_bytes_reserved;
u64 chunk_bytes_reserved;
unsigned long delayed_ref_updates;
+ unsigned long delayed_ref_csum_deletions;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
@@ -139,6 +144,7 @@ struct btrfs_trans_handle {
bool in_fsync;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
+ struct btrfs_block_rsv delayed_rsv;
};
/*
@@ -172,7 +178,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
{
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
- inode->last_sub_trans = inode->root->log_transid;
+ inode->last_sub_trans = btrfs_get_root_log_transid(inode->root);
inode->last_log_commit = inode->last_sub_trans - 1;
spin_unlock(&inode->lock);
}
@@ -200,32 +206,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
-bool __cold abort_should_print_stack(int errno);
+bool __cold abort_should_print_stack(int error);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact stack trace is reported for some errors.
*/
-#define btrfs_abort_transaction(trans, errno) \
+#define btrfs_abort_transaction(trans, error) \
do { \
bool first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
first = true; \
- if (WARN(abort_should_print_stack(errno), \
+ if (WARN(abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
- (errno))) { \
+ (error))) { \
/* Stack trace printed. */ \
} else { \
btrfs_err((trans)->fs_info, \
"Transaction aborted (error %d)", \
- (errno)); \
+ (error)); \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno), first); \
+ __LINE__, (error), first); \
} while (0)
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -243,7 +249,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root);
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
@@ -264,7 +269,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit);
+ unsigned int line, int error, bool first_hit);
int __init btrfs_transaction_init(void);
void __cold btrfs_transaction_exit(void);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ab08a0b01311..50fdc69fdddf 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -29,6 +29,9 @@
#include "accessors.h"
#include "file-item.h"
#include "inode-item.h"
+#include "dir-item.h"
+#include "raid-stripe-tree.h"
+#include "extent-tree.h"
/*
* Error message should follow the following format:
@@ -1274,6 +1277,8 @@ static int check_extent_item(struct extent_buffer *leaf,
unsigned long ptr; /* Current pointer inside inline refs */
unsigned long end; /* Extent item end */
const u32 item_size = btrfs_item_size(leaf, slot);
+ u8 last_type = 0;
+ u64 last_seq = U64_MAX;
u64 flags;
u64 generation;
u64 total_refs; /* Total refs in btrfs_extent_item */
@@ -1320,6 +1325,18 @@ static int check_extent_item(struct extent_buffer *leaf,
* 2.2) Ref type specific data
* Either using btrfs_extent_inline_ref::offset, or specific
* data structure.
+ *
+ * All above inline items should follow the order:
+ *
+ * - All btrfs_extent_inline_ref::type should be in an ascending
+ * order
+ *
+ * - Within the same type, the items should follow a descending
+ * order by their sequence number. The sequence number is
+ * determined by:
+ * * btrfs_extent_inline_ref::offset for all types other than
+ * EXTENT_DATA_REF
+ * * hash_extent_data_ref() for EXTENT_DATA_REF
*/
if (unlikely(item_size < sizeof(*ei))) {
extent_err(leaf, slot,
@@ -1401,6 +1418,7 @@ static int check_extent_item(struct extent_buffer *leaf,
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
+ u64 seq;
u64 dref_offset;
u64 inline_offset;
u8 inline_type;
@@ -1414,6 +1432,7 @@ static int check_extent_item(struct extent_buffer *leaf,
iref = (struct btrfs_extent_inline_ref *)ptr;
inline_type = btrfs_extent_inline_ref_type(leaf, iref);
inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ seq = inline_offset;
if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
@@ -1444,6 +1463,10 @@ static int check_extent_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
+ seq = hash_extent_data_ref(
+ btrfs_extent_data_ref_root(leaf, dref),
+ btrfs_extent_data_ref_objectid(leaf, dref),
+ btrfs_extent_data_ref_offset(leaf, dref));
if (unlikely(!IS_ALIGNED(dref_offset,
fs_info->sectorsize))) {
extent_err(leaf, slot,
@@ -1465,11 +1488,32 @@ static int check_extent_item(struct extent_buffer *leaf,
}
inline_refs += btrfs_shared_data_ref_count(leaf, sref);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ break;
default:
extent_err(leaf, slot, "unknown inline ref type: %u",
inline_type);
return -EUCLEAN;
}
+ if (inline_type < last_type) {
+ extent_err(leaf, slot,
+ "inline ref out-of-order: has type %u, prev type %u",
+ inline_type, last_type);
+ return -EUCLEAN;
+ }
+ /* Type changed, allow the sequence to start from U64_MAX again. */
+ if (inline_type > last_type)
+ last_seq = U64_MAX;
+ if (seq > last_seq) {
+ extent_err(leaf, slot,
+"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx",
+ inline_type, inline_offset, seq,
+ last_type, last_seq);
+ return -EUCLEAN;
+ }
+ last_type = inline_type;
+ last_seq = seq;
ptr += btrfs_extent_inline_ref_size(inline_type);
}
/* No padding is allowed */
@@ -1631,6 +1675,44 @@ static int check_inode_ref(struct extent_buffer *leaf,
return 0;
}
+static int check_raid_stripe_extent(const struct extent_buffer *leaf,
+ const struct btrfs_key *key, int slot)
+{
+ struct btrfs_stripe_extent *stripe_extent =
+ btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
+ generic_err(leaf, slot,
+"invalid key objectid for raid stripe extent, have %llu expect aligned to %u",
+ key->objectid, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+
+ if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) {
+ generic_err(leaf, slot,
+ "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset");
+ return -EUCLEAN;
+ }
+
+ switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) {
+ case BTRFS_STRIPE_RAID0:
+ case BTRFS_STRIPE_RAID1:
+ case BTRFS_STRIPE_DUP:
+ case BTRFS_STRIPE_RAID10:
+ case BTRFS_STRIPE_RAID5:
+ case BTRFS_STRIPE_RAID6:
+ case BTRFS_STRIPE_RAID1C3:
+ case BTRFS_STRIPE_RAID1C4:
+ break;
+ default:
+ generic_err(leaf, slot, "invalid raid stripe encoding %u",
+ btrfs_stripe_extent_encoding(leaf, stripe_extent));
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
/*
* Common point to switch the item-specific validation.
*/
@@ -1685,6 +1767,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
ret = check_extent_data_ref(leaf, key, slot);
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ ret = check_raid_stripe_extent(leaf, key, slot);
+ break;
}
if (ret)
@@ -2005,7 +2090,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* So we only check tree blocks which are read from disk, whose
* generation <= fs_info->last_trans_committed.
*/
- if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+ if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info))
return 0;
/* We have @first_key, so this @eb must have at least one item */
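The ordering rule enforced by the new check in check_extent_item() can be summarized as a small standalone sketch (illustrative only, not part of the patch; inline_ref and inline_refs_ordered are hypothetical names, and the kernel's u8/u64 types and U64_MAX are assumed): inline backrefs must be sorted by ascending type and, within one type, by descending sequence number.

	/* Inline refs: ascending by type, descending by seq within a type. */
	struct inline_ref { u8 type; u64 seq; };

	static bool inline_refs_ordered(const struct inline_ref *refs, int nr)
	{
		u8 last_type = 0;
		u64 last_seq = U64_MAX;

		for (int i = 0; i < nr; i++) {
			if (refs[i].type < last_type)
				return false;		/* type went backwards */
			if (refs[i].type > last_type)
				last_seq = U64_MAX;	/* new type, restart the descending run */
			if (refs[i].seq > last_seq)
				return false;		/* seq increased within the same type */
			last_type = refs[i].type;
			last_seq = refs[i].seq;
		}
		return true;
	}

For EXTENT_DATA_REF the sequence number is hash_extent_data_ref(root, objectid, offset); for every other inline ref type it is btrfs_extent_inline_ref::offset, matching the comment added in the hunk above.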
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cbb17b542131..7d6729d9fd2f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -347,8 +347,7 @@ static int process_one_buffer(struct btrfs_root *log,
}
if (wc->pin) {
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
- eb->len);
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
if (ret)
return ret;
@@ -504,9 +503,9 @@ insert:
found_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
- btrfs_truncate_item(path, item_size, 1);
+ btrfs_truncate_item(trans, path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(path, item_size - found_size);
+ btrfs_extend_item(trans, path, item_size - found_size);
} else if (ret) {
return ret;
}
@@ -574,7 +573,7 @@ insert:
}
}
no_copy:
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -767,7 +766,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
} else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
- ins.objectid, ins.offset, 0);
+ ins.objectid, ins.offset, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key->objectid, offset, 0, false);
@@ -890,7 +890,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
update_inode:
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
out:
iput(inode);
return ret;
@@ -1445,7 +1445,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
goto out;
}
@@ -1483,8 +1483,7 @@ out:
return ret;
}
-static int count_inode_extrefs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret = 0;
int name_len;
@@ -1498,8 +1497,8 @@ static int count_inode_extrefs(struct btrfs_root *root,
struct extent_buffer *leaf;
while (1) {
- ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
- &extref, &offset);
+ ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
+ path, &extref, &offset);
if (ret)
break;
@@ -1527,8 +1526,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
return nlink;
}
-static int count_inode_refs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret;
struct btrfs_key key;
@@ -1543,7 +1541,7 @@ static int count_inode_refs(struct btrfs_root *root,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
if (ret < 0)
break;
if (ret > 0) {
@@ -1595,9 +1593,9 @@ process_slot:
* will free the inode.
*/
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct inode *inode)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int ret;
u64 nlink = 0;
@@ -1607,13 +1605,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = count_inode_refs(root, BTRFS_I(inode), path);
+ ret = count_inode_refs(BTRFS_I(inode), path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(root, BTRFS_I(inode), path);
+ ret = count_inode_extrefs(BTRFS_I(inode), path);
if (ret < 0)
goto out;
@@ -1623,7 +1621,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
goto out;
}
@@ -1685,7 +1683,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
}
- ret = fixup_inode_link_count(trans, root, inode);
+ ret = fixup_inode_link_count(trans, inode);
iput(inode);
if (ret)
break;
@@ -1732,7 +1730,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
else
inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
}
@@ -1939,7 +1937,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ ret = btrfs_update_inode(trans, BTRFS_I(dir));
}
kfree(name.name);
iput(dir);
@@ -2483,7 +2481,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
drop_args.bytes_found);
/* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
- root, BTRFS_I(inode));
+ BTRFS_I(inode));
}
iput(inode);
if (ret)
@@ -2574,7 +2572,7 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(eb);
if (trans) {
- ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
+ ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
return ret;
btrfs_redirty_list_add(trans->transaction, eb);
@@ -2848,10 +2846,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
}
/*
- * btrfs_sync_log does sends a given tree log down to the disk and
- * updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk only
- * if it returns 0.
+ * Sends a given tree log down to the disk and updates the super blocks to
+ * record it. When this call is done, you know that any inodes previously
+ * logged are safely on disk only if it returns 0.
*
* Any other return value means you need to call btrfs_commit_transaction.
* Some of the edge cases for fsyncing directories that have had unlinks
@@ -2961,7 +2958,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&log->root_item, log->node);
memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
- root->log_transid++;
+ btrfs_set_root_log_transid(root, root->log_transid + 1);
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
@@ -2999,9 +2996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = update_log_root(trans, log, &new_root_item);
if (ret) {
- if (!list_empty(&root_log_ctx.list))
- list_del_init(&root_log_ctx.list);
-
+ list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
if (ret != -ENOSPC)
@@ -3021,7 +3016,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
@@ -3136,8 +3130,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* someone else already started it. We use <= and not < because the
* first log transaction has an ID of 0.
*/
- ASSERT(root->last_log_commit <= log_transid);
- root->last_log_commit = log_transid;
+ ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
+ btrfs_set_root_last_log_commit(root, log_transid);
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
@@ -3211,8 +3205,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
}
}
- clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
+ extent_io_tree_release(&log->dirty_log_pages);
extent_io_tree_release(&log->log_csum_range);
btrfs_put_root(log);
@@ -3530,7 +3523,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
last_offset = max(last_offset, curr_end);
}
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -4138,19 +4131,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
/*
* We do not need to set the nbytes field, in fact during a fast fsync
@@ -4488,7 +4481,7 @@ copy_item:
dst_index++;
}
- btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
btrfs_release_path(dst_path);
out:
kfree(ins_data);
@@ -4693,7 +4686,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, &fi,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(fi));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -4921,12 +4914,12 @@ process:
set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
- spin_lock_irq(&inode->ordered_tree.lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
atomic_inc(&trans->transaction->pending_ordered);
}
- spin_unlock_irq(&inode->ordered_tree.lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
btrfs_put_ordered_extent(ordered);
}
@@ -7204,9 +7197,7 @@ again:
* each subsequent pass.
*/
if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans,
- log->node->start,
- log->node->len);
+ ret = btrfs_pin_extent_for_log_replay(trans, log->node);
btrfs_put_root(log);
if (!ret)
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 33606025513d..b4ac2b0cd235 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -223,7 +223,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
}
/*
- * ulist_del - delete one node from ulist
+ * Delete one node from ulist.
+ *
* @ulist: ulist to remove node from
* @val: value to delete
* @aux: aux to delete
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 7c7001f42b14..5be74f9e47eb 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -124,7 +124,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
* An item with that type already exists.
* Extend the item and store the new subid at the end.
*/
- btrfs_extend_item(path, sizeof(subid_le));
+ btrfs_extend_item(trans, path, sizeof(subid_le));
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
@@ -139,7 +139,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
ret = 0;
subid_le = cpu_to_le64(subid_cpu);
write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
@@ -221,7 +221,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
move_src = offset + sizeof(subid);
move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
memmove_extent_buffer(eb, move_dst, move_src, move_len);
- btrfs_truncate_item(path, item_size - sizeof(subid), 1);
+ btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 744f4f4d4c68..66e2270b0dae 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -487,7 +487,7 @@ static int rollback_verity(struct btrfs_inode *inode)
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -554,7 +554,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto end_trans;
ret = del_orphan(trans, inode);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9ef6f54635c..f627674b37db 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -35,6 +35,7 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "raid-stripe-tree.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
@@ -357,21 +358,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
}
/*
- * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the UUID to fs_devices::fsid
- * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
+ * Allocate new btrfs_fs_devices structure identified by a fsid.
+ *
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to
+ * fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
- const u8 *metadata_fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
struct btrfs_fs_devices *fs_devs;
- ASSERT(fsid || !metadata_fsid);
-
fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
@@ -385,8 +384,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
if (fsid) {
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
- memcpy(fs_devs->metadata_uuid,
- metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
}
return fs_devs;
@@ -457,91 +455,41 @@ static noinline struct btrfs_fs_devices *find_fsid(
return NULL;
}
-/*
- * First check if the metadata_uuid is different from the fsid in the given
- * fs_devices. Then check if the given fsid is the same as the metadata_uuid
- * in the fs_devices. If it is, return true; otherwise, return false.
- */
-static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
-{
- return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
-}
-
-static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
- struct btrfs_super_block *disk_super)
-{
-
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by first scanning
- * a device which didn't have its fsid/metadata_uuid changed
- * at all and the CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
- fs_devices->fsid))
- return fs_devices;
- }
-
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by a device that
- * has an outdated pair of fsid/metadata_uuid and
- * CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
- return fs_devices;
- }
-
- return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
-}
-
-
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct block_device **bdev,
+ int flush, struct bdev_handle **bdev_handle,
struct btrfs_super_block **disk_super)
{
+ struct block_device *bdev;
int ret;
- *bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
+ *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev)) {
- ret = PTR_ERR(*bdev);
+ if (IS_ERR(*bdev_handle)) {
+ ret = PTR_ERR(*bdev_handle);
goto error;
}
+ bdev = (*bdev_handle)->bdev;
if (flush)
- sync_blockdev(*bdev);
- ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
+ sync_blockdev(bdev);
+ ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- blkdev_put(*bdev, holder);
+ bdev_release(*bdev_handle);
goto error;
}
- invalidate_bdev(*bdev);
- *disk_super = btrfs_read_dev_super(*bdev);
+ invalidate_bdev(bdev);
+ *disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- blkdev_put(*bdev, holder);
+ bdev_release(*bdev_handle);
goto error;
}
return 0;
error:
- *bdev = NULL;
+ *bdev_handle = NULL;
return ret;
}
@@ -562,13 +510,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
- int ret = 0;
+ int ret;
+ bool freed = false;
lockdep_assert_held(&uuid_mutex);
- if (devt)
- ret = -ENOENT;
-
+ /* Return good status if there is no instance of devt. */
+ ret = 0;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
mutex_lock(&fs_devices->device_list_mutex);
@@ -579,8 +527,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
if (devt && devt != device->devt)
continue;
if (fs_devices->opened) {
- /* for an already deleted device return 0 */
- if (devt && ret != 0)
+ if (devt)
ret = -EBUSY;
break;
}
@@ -590,7 +537,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
list_del(&device->dev_list);
btrfs_free_device(device);
- ret = 0;
+ freed = true;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -601,9 +548,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
}
}
+ /* If there is at least one freed device return 0. */
+ if (freed)
+ return 0;
+
return ret;
}
+static struct btrfs_fs_devices *find_fsid_by_device(
+ struct btrfs_super_block *disk_super,
+ dev_t devt, bool *same_fsid_diff_dev)
+{
+ struct btrfs_fs_devices *fsid_fs_devices;
+ struct btrfs_fs_devices *devt_fs_devices;
+ const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+ bool found_by_devt = false;
+
+ /* Find the fs_device by the usual method, if found use it. */
+ fsid_fs_devices = find_fsid(disk_super->fsid,
+ has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+
+ /* The temp_fsid feature is supported only with single device filesystem. */
+ if (btrfs_super_num_devices(disk_super) != 1)
+ return fsid_fs_devices;
+
+ /*
+ * A seed device is an integral component of the sprout device, which
+ * functions as a multi-device filesystem, so the temp-fsid feature is
+ * not supported.
+ */
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
+ return fsid_fs_devices;
+
+ /* Try to find a fs_devices by matching devt. */
+ list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
+ if (device->devt == devt) {
+ found_by_devt = true;
+ break;
+ }
+ }
+ if (found_by_devt)
+ break;
+ }
+
+ if (found_by_devt) {
+ /* Existing device. */
+ if (fsid_fs_devices == NULL) {
+ if (devt_fs_devices->opened == 0) {
+ /* Stale device. */
+ return NULL;
+ } else {
+ /* temp_fsid is mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* Regular or temp_fsid device mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* New device. */
+ if (fsid_fs_devices == NULL) {
+ return NULL;
+ } else {
+ /* sb::fsid is already used, create a new temp_fsid. */
+ *same_fsid_diff_dev = true;
+ return NULL;
+ }
+ }
+
+ /* Not reached. */
+}
+
/*
* This is only used on mount, and we are protected from competing things
* messing with our fs_devices by the uuid_mutex, thus we do not need the
@@ -613,7 +632,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -624,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &disk_super);
+ &bdev_handle, &disk_super);
if (ret)
return ret;
@@ -648,21 +667,21 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev))
+ if (bdev_read_only(bdev_handle->bdev))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev))
+ if (!bdev_nonrot(bdev_handle->bdev))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev))
+ if (bdev_max_discard_sectors(bdev_handle->bdev))
fs_devices->discardable = true;
- device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
+ device->bdev = bdev_handle->bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- device->holder = holder;
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
@@ -676,7 +695,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, holder);
+ bdev_release(bdev_handle);
return -EINVAL;
}
@@ -690,84 +709,6 @@ u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
}
/*
- * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
- * being created with a disk that has already completed its fsid change. Such
- * disk can belong to an fs which has its FSID changed or to one which doesn't.
- * Handle both cases here.
- */
-static struct btrfs_fs_devices *find_fsid_inprogress(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->fsid))
- return fs_devices;
- }
-
- return find_fsid(disk_super->fsid, NULL);
-}
-
-static struct btrfs_fs_devices *find_fsid_changed(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handles the case where scanned device is part of an fs that had
- * multiple successful changes of FSID but currently device didn't
- * observe it. Meaning our fsid will be different than theirs. We need
- * to handle two subcases :
- * 1 - The fs still continues to have different METADATA/FSID uuids.
- * 2 - The fs is switched back to its original FSID (METADATA/FSID
- * are equal).
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- /* Changed UUIDs */
- if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
- memcmp(fs_devices->fsid, disk_super->fsid,
- BTRFS_FSID_SIZE) != 0)
- return fs_devices;
-
- /* Unchanged UUIDs */
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, disk_super->metadata_uuid,
- BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- }
-
- return NULL;
-}
-
-static struct btrfs_fs_devices *find_fsid_reverted_metadata(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handle the case where the scanned device is part of an fs whose last
- * metadata UUID change reverted it to the original FSID. At the same
- * time fs_devices was first created by another constituent device
- * which didn't fully observe the operation. This results in an
- * btrfs_fs_devices created with metadata/fsid different AND
- * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
- * fs_devices equal to the FSID of the disk.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->fsid))
- return fs_devices;
- }
-
- return NULL;
-}
-/*
* Add new device to list of registered devices
*
* Returns:
@@ -785,10 +726,16 @@ static noinline struct btrfs_device *device_list_add(const char *path,
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_t path_devt;
int error;
+ bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
- bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
- BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+ btrfs_err(NULL,
+"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
+ path);
+ return ERR_PTR(-EAGAIN);
+ }
error = lookup_bdev(path, &path_devt);
if (error) {
@@ -797,27 +744,23 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(error);
}
- if (fsid_change_in_progress) {
- if (!has_metadata_uuid)
- fs_devices = find_fsid_inprogress(disk_super);
- else
- fs_devices = find_fsid_changed(disk_super);
- } else if (has_metadata_uuid) {
- fs_devices = find_fsid_with_metadata_uuid(disk_super);
- } else {
- fs_devices = find_fsid_reverted_metadata(disk_super);
- if (!fs_devices)
- fs_devices = find_fsid(disk_super->fsid, NULL);
- }
-
+ fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
if (!fs_devices) {
- fs_devices = alloc_fs_devices(disk_super->fsid,
- has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+ fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
- fs_devices->fsid_change = fsid_change_in_progress;
+ if (has_metadata_uuid)
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+ if (same_fsid_diff_dev) {
+ generate_random_uuid(fs_devices->fsid);
+ fs_devices->temp_fsid = true;
+ pr_info("BTRFS: device %s using temp-fsid %pU\n",
+ path, fs_devices->fsid);
+ }
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
@@ -832,18 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, &args);
- /*
- * If this disk has been pulled into an fs devices created by
- * a device which had the CHANGING_FSID_V2 flag then replace the
- * metadata_uuid/fsid values of the fs_devices.
- */
- if (fs_devices->fsid_change &&
- found_transid > fs_devices->latest_generation) {
+ if (found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
memcpy(fs_devices->metadata_uuid,
btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
- fs_devices->fsid_change = false;
}
}
@@ -997,7 +933,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
lockdep_assert_held(&uuid_mutex);
- fs_devices = alloc_fs_devices(orig->fsid, NULL);
+ fs_devices = alloc_fs_devices(orig->fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -1068,9 +1004,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev) {
- blkdev_put(device->bdev, device->holder);
+ if (device->bdev_handle) {
+ bdev_release(device->bdev_handle);
device->bdev = NULL;
+ device->bdev_handle = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1115,7 +1052,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- blkdev_put(device->bdev, device->holder);
+ bdev_release(device->bdev_handle);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1356,14 +1293,19 @@ int btrfs_forget_devices(dev_t devt)
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
- * is read via pagecache
+ * is read via pagecache.
+ *
+ * With @mount_arg_dev it's a scan during mount time that will always register
+ * the device or return an error. Multi-device and seeding devices are registered
+ * in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+ bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
u64 bytenr, bytenr_orig;
int ret;
@@ -1386,31 +1328,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev = blkdev_get_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_handle))
+ return ERR_CAST(bdev_handle);
bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
+ disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+ bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
+ if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+ !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
+ dev_t devt;
+
+ ret = lookup_bdev(path, &devt);
+ if (ret)
+ btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
+ path, ret);
+ else
+ btrfs_free_stale_devices(devt, NULL);
+
+ pr_debug("BTRFS: skip registering single non-seed device %s\n", path);
+ device = NULL;
+ goto free_disk_super;
+ }
+
device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
+free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- blkdev_put(bdev, NULL);
+ bdev_release(bdev_handle);
return device;
}
@@ -1894,7 +1854,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -2087,7 +2047,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, void **holder)
+ struct bdev_handle **bdev_handle)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2196,7 +2156,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev) {
+ if (device->bdev_handle) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2212,9 +2172,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and blkdev_put() will pull in the ->open_mutex on the
- * block device and it's dependencies. Instead just flush the device
- * and let the caller do the final blkdev_put.
+ * write lock, and bdev_release() will pull in the ->open_mutex on
+ * the block device and its dependencies. Instead just flush the
+ * device and let the caller do the final bdev_release.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_scratch_superblocks(fs_info, device->bdev,
@@ -2225,8 +2185,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
}
- *bdev = device->bdev;
- *holder = device->holder;
+ *bdev_handle = device->bdev_handle;
synchronize_rcu();
btrfs_free_device(device);
@@ -2363,7 +2322,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int ret;
if (!path || !path[0])
@@ -2381,7 +2340,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev, &disk_super);
+ &bdev_handle, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2394,7 +2353,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, NULL);
+ bdev_release(bdev_handle);
return 0;
}
@@ -2451,7 +2410,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
* Private copy of the seed devices, anchored at
* fs_info->fs_devices->seed_list
*/
- seed_devices = alloc_fs_devices(NULL, NULL);
+ seed_devices = alloc_fs_devices(NULL);
if (IS_ERR(seed_devices))
return seed_devices;
@@ -2597,7 +2556,7 @@ next_slot:
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
device->generation);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
path->slots[0]++;
@@ -2614,7 +2573,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2627,12 +2586,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
+ bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->bdev_holder, NULL);
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
- if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
ret = -EINVAL;
goto error;
}
@@ -2644,11 +2603,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev);
+ sync_blockdev(bdev_handle->bdev);
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev) {
+ if (device->bdev == bdev_handle->bdev) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2664,7 +2623,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
+ device->bdev = bdev_handle->bdev;
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2685,12 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes =
- round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
+ round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->holder = fs_info->bdev_holder;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2726,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!bdev_nonrot(bdev))
+ if (!bdev_nonrot(device->bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2848,7 +2807,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- blkdev_put(bdev, fs_info->bdev_holder);
+ bdev_release(bdev_handle);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -2895,7 +2854,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
@@ -2929,6 +2888,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
btrfs_set_super_total_bytes(super_copy,
round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
+ atomic64_add(diff, &fs_info->free_chunk_space);
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
@@ -3027,7 +2987,8 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
}
/*
- * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * Find the mapping containing the given logical extent.
+ *
* @logical: Logical block offset in bytes.
* @length: Length of extent in bytes.
*
@@ -3045,15 +3006,16 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
read_unlock(&em_tree->lock);
if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ btrfs_crit(fs_info,
+ "unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len < logical) {
+ if (em->start > logical || em->start + em->len <= logical) {
btrfs_crit(fs_info,
- "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
- logical, length, em->start, em->start + em->len);
+ "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
+ logical, logical + length, em->start, em->start + em->len);
free_extent_map(em);
return ERR_PTR(-EINVAL);
}
@@ -3483,7 +3445,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
btrfs_set_balance_flags(leaf, item, bctl->flags);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
@@ -4838,6 +4800,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff;
u64 start;
+ u64 free_diff = 0;
new_size = round_down(new_size, fs_info->sectorsize);
start = new_size;
@@ -4863,7 +4826,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
btrfs_device_set_total_bytes(device, new_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
- atomic64_sub(diff, &fs_info->free_chunk_space);
+
+ /*
+ * The new free_chunk_space is new_size - used, so we have to
+ * subtract the delta of the old free_chunk_space which included
+ * old_size - used. If used > new_size then just subtract this
+ * entire device's free space.
+ */
+ if (device->bytes_used < new_size)
+ free_diff = (old_size - device->bytes_used) -
+ (new_size - device->bytes_used);
+ else
+ free_diff = old_size - device->bytes_used;
+ atomic64_sub(free_diff, &fs_info->free_chunk_space);
}
/*
@@ -4998,9 +4973,10 @@ done:
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes += diff;
- atomic64_add(diff, &fs_info->free_chunk_space);
+ atomic64_add(free_diff, &fs_info->free_chunk_space);
+ }
mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
@@ -5880,6 +5856,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical,
u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -5899,6 +5876,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
bioc->fs_info = fs_info;
bioc->replace_stripe_src = -1;
bioc->full_stripe_logical = (u64)-1;
+ bioc->logical = logical;
return bioc;
}
@@ -6203,12 +6181,20 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
return U64_MAX;
}
-static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
- u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length, struct btrfs_io_stripe *dst,
+ struct map_lookup *map, u32 stripe_index,
+ u64 stripe_offset, u64 stripe_nr)
{
dst->dev = map->stripes[stripe_index].dev;
+
+ if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+ return btrfs_get_raid_extent_offset(fs_info, logical, length,
+ map->type, stripe_index, dst);
+
dst->physical = map->stripes[stripe_index].physical +
stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ return 0;
}
/*
@@ -6245,16 +6231,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
* For RAID6 profile, mirror > 2 means mark another
* data/P stripe error and rebuild from the remaining
* stripes..
- *
- * @need_raid_map: (Used only for integrity checker) whether the map wants
- * a full stripe map (including all data and P/Q stripes)
- * for RAID56. Should always be 1 except integrity checker.
*/
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map)
+ struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6349,8 +6330,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
+ if (op != BTRFS_MAP_READ || mirror_num > 1) {
/*
+ * Needs full stripe mapping.
+ *
* Push stripe_nr back to the start of the full stripe
* For those cases needing a full stripe, @stripe_nr
* is the full stripe number.
@@ -6373,19 +6356,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
stripe_index = 0;
stripe_offset = 0;
} else {
- /*
- * Mirror #0 or #1 means the original data block.
- * Mirror #2 is RAID5 parity block.
- * Mirror #3 is RAID6 Q block.
- */
+ ASSERT(mirror_num <= 1);
+ /* Just grab the data stripe directly. */
stripe_index = stripe_nr % data_stripes;
stripe_nr /= data_stripes;
- if (mirror_num > 1)
- stripe_index = data_stripes + mirror_num - 2;
/* We distribute the parity blocks across stripes */
stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (op == BTRFS_MAP_READ && mirror_num <= 1)
+ if (op == BTRFS_MAP_READ && mirror_num < 1)
mirror_num = 1;
}
} else {
@@ -6424,16 +6402,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* I/O context structure.
*/
if (smap && num_alloc_stripes == 1 &&
+ !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
+ op != BTRFS_MAP_READ) &&
!((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length, smap, map,
+ stripe_index, stripe_offset, stripe_nr);
if (mirror_num_ret)
*mirror_num_ret = mirror_num;
*bioc_ret = NULL;
- ret = 0;
goto out;
}
- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
+ bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
if (!bioc) {
ret = -ENOMEM;
goto out;
@@ -6447,7 +6427,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*
* It's still mostly the same as other profiles, just with extra rotation.
*/
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
(op != BTRFS_MAP_READ || mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
@@ -6459,22 +6439,35 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*/
bioc->full_stripe_logical = em->start +
btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
- for (i = 0; i < num_stripes; i++)
- set_io_stripe(&bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
+ for (int i = 0; i < num_stripes; i++) {
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map,
+ (i + stripe_nr) % num_stripes,
+ stripe_offset, stripe_nr);
+ if (ret < 0)
+ break;
+ }
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
for (i = 0; i < num_stripes; i++) {
- set_io_stripe(&bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map, stripe_index,
+ stripe_offset, stripe_nr);
+ if (ret < 0)
+ break;
stripe_index++;
}
}
+ if (ret) {
+ *bioc_ret = NULL;
+ btrfs_put_bioc(bioc);
+ goto out;
+ }
+
if (op != BTRFS_MAP_READ)
max_errors = btrfs_chunk_max_errors(map);
@@ -6901,7 +6894,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
- fs_devices = alloc_fs_devices(fsid, NULL);
+ fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -7534,7 +7527,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
@@ -8076,7 +8069,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
ASSERT(mirror_num > 0);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
- &bioc, smap, &mirror_ret, true);
+ &bioc, smap, &mirror_ret);
if (ret < 0)
return ret;
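The volumes.c changes above convert each blkdev_get_by_path()/blkdev_put() pairing to the bdev_handle API, so the holder is carried by the handle rather than stored in btrfs_device. A minimal sketch of the resulting open/use/release pattern (illustrative only, not lifted verbatim from the patch; path and holder are placeholders):

	struct bdev_handle *handle;

	handle = bdev_open_by_path(path, BLK_OPEN_READ, holder, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* The raw block device is reached through handle->bdev. */
	sync_blockdev(handle->bdev);
	/* ... read the super block, register the device, etc. ... */

	/* Replaces blkdev_put(bdev, holder); the handle remembers the holder. */
	bdev_release(handle);
	return 0;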
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2128a032c3b7..9cc374864a79 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -90,13 +90,11 @@ struct btrfs_device {
u64 generation;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
- /* block device holder for blkdev_get/put */
- void *holder;
-
/*
* Device's major-minor number. Must be set even if the device is not
* opened (bdev == NULL), unless the device is missing.
@@ -290,6 +288,19 @@ struct btrfs_fs_devices {
* - Following shall be true at all times:
* - metadata_uuid == btrfs_header::fsid
* - metadata_uuid == btrfs_dev_item::fsid
+ *
+ * - Relations between fsid and metadata_uuid in sb and fs_devices:
+ * - Normal:
+ * fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid
+ * sb->metadata_uuid == 0
+ *
+ * - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set:
+ * fs_devices->fsid == sb->fsid
+ * fs_devices->metadata_uuid == sb->metadata_uuid
+ *
+ * - When in-memory fs_devices->temp_fsid is true
+ * fs_devices->fsid = random
+ * fs_devices->metadata_uuid == sb->fsid
*/
u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -353,9 +364,10 @@ struct btrfs_fs_devices {
bool rotating;
/* Devices support TRIM/discard commands. */
bool discardable;
- bool fsid_change;
/* The filesystem is a seed filesystem. */
bool seeding;
+ /* The mount needs to use a randomly generated fsid. */
+ bool temp_fsid;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -381,12 +393,12 @@ struct btrfs_fs_devices {
struct btrfs_io_stripe {
struct btrfs_device *dev;
- union {
- /* Block mapping */
- u64 physical;
- /* For the endio handler */
- struct btrfs_io_context *bioc;
- };
+ /* Block mapping. */
+ u64 physical;
+ u64 length;
+ bool is_scrub;
+ /* For the endio handler. */
+ struct btrfs_io_context *bioc;
};
struct btrfs_discard_stripe {
@@ -419,6 +431,11 @@ struct btrfs_io_context {
atomic_t error;
u16 max_errors;
+ u64 logical;
+ u64 size;
+ /* Raid stripe tree ordered entry. */
+ struct list_head rst_ordered_entry;
+
/*
* The total number of stripes, including the extra duplicated
* stripe for replace.
@@ -596,8 +613,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map);
+ struct btrfs_io_stripe *smap, int *mirror_num_ret);
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
struct btrfs_io_stripe *smap, u64 logical,
u32 length, int mirror_num);
@@ -611,7 +627,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags);
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+ bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -629,7 +646,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, void **holder);
+ struct bdev_handle **bdev_handle);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
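
A hedged sketch of what the struct btrfs_io_stripe change above allows: with the physical/bioc union removed, a stripe now carries the block mapping, a length, a scrub flag and the endio context at the same time. The helper below is hypothetical and only illustrates the field layout from the hunk.

/* Hypothetical helper; fields are those shown in the volumes.h hunk above. */
static void init_io_stripe(struct btrfs_io_stripe *stripe,
			   struct btrfs_device *dev, u64 physical, u64 length)
{
	stripe->dev = dev;
	stripe->physical = physical;	/* block mapping */
	stripe->length = length;
	stripe->is_scrub = false;
	stripe->bioc = NULL;		/* set later for the endio handler */
}
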
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 96828a13dd43..3cf236fb40a4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -188,15 +188,15 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
if (old_data_len + name_len + sizeof(*di) == item_size) {
/* No other xattrs packed in the same leaf item. */
if (size > old_data_len)
- btrfs_extend_item(path, size - old_data_len);
+ btrfs_extend_item(trans, path, size - old_data_len);
else if (size < old_data_len)
- btrfs_truncate_item(path, data_size, 1);
+ btrfs_truncate_item(trans, path, data_size, 1);
} else {
/* There are other xattrs packed in the same item. */
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto out;
- btrfs_extend_item(path, data_size);
+ btrfs_extend_item(trans, path, data_size);
}
ptr = btrfs_item_ptr(leaf, slot, char);
@@ -205,7 +205,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
write_extent_buffer(leaf, value, data_ptr, size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
/*
* Insert, and we had space for the xattr, so path->slots[0] is
@@ -265,7 +265,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -408,7 +408,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (!ret) {
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
}
@@ -442,7 +442,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = {
.set = btrfs_xattr_handler_set_prop,
};
-const struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler * const btrfs_xattr_handlers[] = {
&btrfs_security_xattr_handler,
&btrfs_trusted_xattr_handler,
&btrfs_user_xattr_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 1cd3fc0a8f17..118118ca3e1d 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -8,7 +8,7 @@
#include <linux/xattr.h>
-extern const struct xattr_handler *btrfs_xattr_handlers[];
+extern const struct xattr_handler * const btrfs_xattr_handlers[];
int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size);
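
The xattr hunks above are mostly mechanical: passing trans through to the item helpers and constifying the handler table. A minimal sketch of the resulting declaration pattern, with made-up example_* names:

/* Illustrative only: both the pointed-to handlers and the array itself are
 * const, so the whole table can live in read-only data. */
static const struct xattr_handler example_xattr_handler = {
	.prefix = "user.",
};

const struct xattr_handler * const example_xattr_handlers[] = {
	&example_xattr_handler,
	NULL,
};
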
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 09bc325d075d..188378ca19c7 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1282,21 +1282,284 @@ out:
return ret;
}
+struct zone_info {
+ u64 physical;
+ u64 capacity;
+ u64 alloc_offset;
+};
+
+static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
+ struct zone_info *info, unsigned long *active,
+ struct map_lookup *map)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *device = map->stripes[zone_idx].dev;
+ int dev_replace_is_ongoing = 0;
+ unsigned int nofs_flag;
+ struct blk_zone zone;
+ int ret;
+
+ info->physical = map->stripes[zone_idx].physical;
+
+ if (!device->bdev) {
+ info->alloc_offset = WP_MISSING_DEV;
+ return 0;
+ }
+
+ /* Consider a zone as active if we can allow any number of active zones. */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(zone_idx, active);
+
+ if (!btrfs_dev_is_sequential(device, info->physical)) {
+ info->alloc_offset = WP_CONVENTIONAL;
+ return 0;
+ }
+
+ /* This zone will be used for allocation, so mark this zone non-empty. */
+ btrfs_dev_clear_zone_empty(device, info->physical);
+
+ down_read(&dev_replace->rwsem);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
+ up_read(&dev_replace->rwsem);
+
+ /*
+ * The group is mapped to a sequential zone. Get the zone write pointer
+ * to determine the allocation offset within the zone.
+ */
+ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_get_dev_zone(device, info->physical, &zone);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret) {
+ if (ret != -EIO && ret != -EOPNOTSUPP)
+ return ret;
+ info->alloc_offset = WP_MISSING_DEV;
+ return 0;
+ }
+
+ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+ zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ device->devid);
+ return -EIO;
+ }
+
+ info->capacity = (zone.capacity << SECTOR_SHIFT);
+
+ switch (zone.cond) {
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ btrfs_err(fs_info,
+ "zoned: offline/readonly zone %llu on device %s (devid %llu)",
+ (info->physical >> device->zone_info->zone_size_shift),
+ rcu_str_deref(device->name), device->devid);
+ info->alloc_offset = WP_MISSING_DEV;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ info->alloc_offset = 0;
+ break;
+ case BLK_ZONE_COND_FULL:
+ info->alloc_offset = info->capacity;
+ break;
+ default:
+ /* Partially used zone. */
+ info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(zone_idx, active);
+ break;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
+ struct zone_info *info,
+ unsigned long *active)
+{
+ if (info->alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ info->physical);
+ return -EIO;
+ }
+
+ bg->alloc_offset = info->alloc_offset;
+ bg->zone_capacity = info->capacity;
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ return 0;
+}
+
+static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
+ return -EINVAL;
+ }
+
+ if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ zone_info[0].physical);
+ return -EIO;
+ }
+ if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ zone_info[1].physical);
+ return -EIO;
+ }
+ if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+ btrfs_err(bg->fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ return -EIO;
+ }
+
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else if (test_bit(0, active)) {
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
+ return 0;
+}
+
+static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ int i;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in %s profile",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EIO;
+ }
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg)) {
+ return -EIO;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ /* In case a device is missing we have a cap of 0, so don't use it. */
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity,
+ zone_info[1].capacity);
+ }
+
+ if (zone_info[0].alloc_offset != WP_MISSING_DEV)
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ else
+ bg->alloc_offset = zone_info[i - 1].alloc_offset;
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ if ((i % map->sub_stripes) == 0) {
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+ }
+
+ return 0;
+}
+
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_device *device;
u64 logical = cache->start;
u64 length = cache->length;
+ struct zone_info *zone_info = NULL;
int ret;
int i;
- unsigned int nofs_flag;
- u64 *alloc_offsets = NULL;
- u64 *caps = NULL;
- u64 *physical = NULL;
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1328,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
- alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
- if (!alloc_offsets) {
- ret = -ENOMEM;
- goto out;
- }
-
- caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
- if (!caps) {
- ret = -ENOMEM;
- goto out;
- }
-
- physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
- if (!physical) {
+ zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
+ if (!zone_info) {
ret = -ENOMEM;
goto out;
}
@@ -1353,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- bool is_sequential;
- struct blk_zone zone;
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
-
- device = map->stripes[i].dev;
- physical[i] = map->stripes[i].physical;
-
- if (device->bdev == NULL) {
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- }
-
- is_sequential = btrfs_dev_is_sequential(device, physical[i]);
- if (is_sequential)
- num_sequential++;
- else
- num_conventional++;
-
- /*
- * Consider a zone as active if we can allow any number of
- * active zones.
- */
- if (!device->zone_info->max_active_zones)
- __set_bit(i, active);
-
- if (!is_sequential) {
- alloc_offsets[i] = WP_CONVENTIONAL;
- continue;
- }
-
- /*
- * This zone will be used for allocation, so mark this zone
- * non-empty.
- */
- btrfs_dev_clear_zone_empty(device, physical[i]);
-
- down_read(&dev_replace->rwsem);
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
- up_read(&dev_replace->rwsem);
-
- /*
- * The group is mapped to a sequential zone. Get the zone write
- * pointer to determine the allocation offset within the zone.
- */
- WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
- nofs_flag = memalloc_nofs_save();
- ret = btrfs_get_dev_zone(device, physical[i], &zone);
- memalloc_nofs_restore(nofs_flag);
- if (ret == -EIO || ret == -EOPNOTSUPP) {
- ret = 0;
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- } else if (ret) {
- goto out;
- }
-
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
- "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT,
- rcu_str_deref(device->name), device->devid);
- ret = -EIO;
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ if (ret)
goto out;
- }
- caps[i] = (zone.capacity << SECTOR_SHIFT);
-
- switch (zone.cond) {
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- btrfs_err(fs_info,
- "zoned: offline/readonly zone %llu on device %s (devid %llu)",
- physical[i] >> device->zone_info->zone_size_shift,
- rcu_str_deref(device->name), device->devid);
- alloc_offsets[i] = WP_MISSING_DEV;
- break;
- case BLK_ZONE_COND_EMPTY:
- alloc_offsets[i] = 0;
- break;
- case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = caps[i];
- break;
- default:
- /* Partially used zone */
- alloc_offsets[i] =
- ((zone.wp - zone.start) << SECTOR_SHIFT);
- __set_bit(i, active);
- break;
- }
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ num_conventional++;
+ else
+ num_sequential++;
}
if (num_sequential > 0)
@@ -1468,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = caps[0];
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- if (map->type & BTRFS_BLOCK_GROUP_DATA) {
- btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
- ret = -EINVAL;
- goto out;
- }
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[1] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[1]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[0] != alloc_offsets[1]) {
- btrfs_err(fs_info,
- "zoned: write pointer offset mismatch of zones in DUP profile");
- ret = -EIO;
- goto out;
- }
- if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(cache)) {
- ret = -EIO;
- goto out;
- }
- } else {
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
- &cache->runtime_flags);
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = min(caps[0], caps[1]);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
break;
case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID0:
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID10:
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
- /* non-single profiles are not supported yet */
default:
btrfs_err(fs_info, "zoned: profile %s not yet supported",
btrfs_bg_type_to_raid_name(map->type));
@@ -1533,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
- if (cache->alloc_offset > fs_info->zone_size) {
- btrfs_err(fs_info,
- "zoned: invalid write pointer %llu in block group %llu",
- cache->alloc_offset, cache->start);
- ret = -EIO;
- }
-
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@@ -1570,9 +1691,7 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
- kfree(physical);
- kfree(caps);
- kfree(alloc_offsets);
+ kfree(zone_info);
free_extent_map(em);
return ret;
@@ -1609,7 +1728,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans,
set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
set_extent_buffer_dirty(eb);
set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
+ EXTENT_DIRTY, NULL);
}
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
@@ -1887,7 +2006,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
int i, ret;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bioc, NULL, NULL, 1);
+ &mapped_length, &bioc, NULL, NULL);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
ret = -EIO;
goto out_put_bioc;
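
To summarize the zoned refactor above: per-stripe state moves into struct zone_info, btrfs_load_zone_info() fills one entry per stripe, and each profile gets its own loader. DUP requires matching write pointers and takes the smaller capacity, the RAID1 variants take the non-zero minimum capacity, RAID0 sums capacity and write-pointer offset over all usable stripes, and RAID10 does the same for one stripe per sub-stripe group. The fragment below is a simplified restatement of the RAID0 loader from the hunk, shown only to make that aggregation rule explicit.

/* Simplified restatement of btrfs_load_block_group_raid0() above; zone
 * activation and error handling are omitted. */
for (int i = 0; i < map->num_stripes; i++) {
	if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
	    zone_info[i].alloc_offset == WP_CONVENTIONAL)
		continue;		/* missing or conventional zone: skip */
	bg->zone_capacity += zone_info[i].capacity;
	bg->alloc_offset += zone_info[i].alloc_offset;
}
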
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e7ac4ec809a4..5511766485cd 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -145,7 +145,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
}
/*
- * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ * Calculate monotonic memory bounds.
*
* It is possible based on the level configurations that a higher level
* workspace uses less memory than a lower level workspace. In order to reuse
@@ -218,7 +218,8 @@ void zstd_cleanup_workspace_manager(void)
}
/*
- * zstd_find_workspace - find workspace
+ * Find workspace for given level.
+ *
* @level: compression level
*
* This iterates over the set bits in the active_map beginning at the requested
@@ -256,7 +257,8 @@ static struct list_head *zstd_find_workspace(unsigned int level)
}
/*
- * zstd_get_workspace - zstd's get_workspace
+ * Zstd get_workspace for level.
+ *
* @level: compression level
*
* If @level is 0, then any compression level can be used. Therefore, we begin
@@ -296,7 +298,8 @@ again:
}
/*
- * zstd_put_workspace - zstd put_workspace
+ * Zstd put_workspace.
+ *
* @ws: list_head for the workspace
*
* When putting back a workspace, we only need to update the LRU if we are of
diff --git a/fs/buffer.c b/fs/buffer.c
index 12e9a71c693d..967f34b70aa8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -282,13 +282,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
} while (tmp != bh);
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- /*
- * If all of the buffers are uptodate then we can set the page
- * uptodate.
- */
- if (folio_uptodate)
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, folio_uptodate);
return;
still_busy:
@@ -915,16 +909,12 @@ int remove_inode_buffers(struct inode *inode)
* which may not fail from ordinary buffer allocations.
*/
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
- bool retry)
+ gfp_t gfp)
{
struct buffer_head *bh, *head;
- gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
struct mem_cgroup *memcg, *old_memcg;
- if (retry)
- gfp |= __GFP_NOFAIL;
-
/* The folio lock pins the memcg */
memcg = folio_memcg(folio);
old_memcg = set_active_memcg(memcg);
@@ -967,7 +957,11 @@ EXPORT_SYMBOL_GPL(folio_alloc_buffers);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry)
{
- return folio_alloc_buffers(page_folio(page), size, retry);
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
+ if (retry)
+ gfp |= __GFP_NOFAIL;
+
+ return folio_alloc_buffers(page_folio(page), size, gfp);
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
@@ -1043,20 +1037,11 @@ grow_dev_page(struct block_device *bdev, sector_t block,
struct buffer_head *bh;
sector_t end_block;
int ret = 0;
- gfp_t gfp_mask;
-
- gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
-
- /*
- * XXX: __getblk_slow() can not really deal with failure and
- * will endlessly loop on improvised global reclaim. Prefer
- * looping in the allocator rather than here, at least that
- * code knows what it's doing.
- */
- gfp_mask |= __GFP_NOFAIL;
folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
bh = folio_buffers(folio);
if (bh) {
@@ -1069,7 +1054,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
goto failed;
}
- bh = folio_alloc_buffers(folio, size, true);
+ ret = -ENOMEM;
+ bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
+ if (!bh)
+ goto failed;
/*
* Link the folio to the buffers and initialise them. Take the
@@ -1420,33 +1408,36 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
}
EXPORT_SYMBOL(__find_get_block);
-/*
- * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
- * which corresponds to the passed block_device, block and size. The
- * returned buffer has its reference count incremented.
+/**
+ * bdev_getblk - Get a buffer_head in a block device's buffer cache.
+ * @bdev: The block device.
+ * @block: The block number.
+ * @size: The size of buffer_heads for this @bdev.
+ * @gfp: The memory allocation flags to use.
*
- * __getblk_gfp() will lock up the machine if grow_dev_page's
- * try_to_free_buffers() attempt is failing. FIXME, perhaps?
+ * Return: The buffer head, or NULL if memory could not be allocated.
*/
-struct buffer_head *
-__getblk_gfp(struct block_device *bdev, sector_t block,
- unsigned size, gfp_t gfp)
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
struct buffer_head *bh = __find_get_block(bdev, block, size);
- might_sleep();
- if (bh == NULL)
- bh = __getblk_slow(bdev, block, size, gfp);
- return bh;
+ might_alloc(gfp);
+ if (bh)
+ return bh;
+
+ return __getblk_slow(bdev, block, size, gfp);
}
-EXPORT_SYMBOL(__getblk_gfp);
+EXPORT_SYMBOL(bdev_getblk);
/*
* Do async read-ahead on a buffer..
*/
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
- struct buffer_head *bh = __getblk(bdev, block, size);
+ struct buffer_head *bh = bdev_getblk(bdev, block, size,
+ GFP_NOWAIT | __GFP_MOVABLE);
+
if (likely(bh)) {
bh_readahead(bh, REQ_RAHEAD);
brelse(bh);
@@ -1470,7 +1461,17 @@ struct buffer_head *
__bread_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+ struct buffer_head *bh;
+
+ gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+
+ /*
+ * Prefer looping in the allocator rather than here, at least that
+ * code knows what it's doing.
+ */
+ gfp |= __GFP_NOFAIL;
+
+ bh = bdev_getblk(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
@@ -1640,12 +1641,13 @@ EXPORT_SYMBOL(block_invalidate_folio);
* block_dirty_folio() via private_lock. try_to_free_buffers
* is already excluded via the folio lock.
*/
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
- unsigned long b_state)
+struct buffer_head *create_empty_buffers(struct folio *folio,
+ unsigned long blocksize, unsigned long b_state)
{
struct buffer_head *bh, *head, *tail;
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
- head = folio_alloc_buffers(folio, blocksize, true);
+ head = folio_alloc_buffers(folio, blocksize, gfp);
bh = head;
do {
bh->b_state |= b_state;
@@ -1667,13 +1669,8 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
}
folio_attach_private(folio, head);
spin_unlock(&folio->mapping->private_lock);
-}
-EXPORT_SYMBOL(folio_create_empty_buffers);
-void create_empty_buffers(struct page *page,
- unsigned long blocksize, unsigned long b_state)
-{
- folio_create_empty_buffers(page_folio(page), blocksize, b_state);
+ return head;
}
EXPORT_SYMBOL(create_empty_buffers);
@@ -1768,13 +1765,15 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
struct inode *inode,
unsigned int b_state)
{
+ struct buffer_head *bh;
+
BUG_ON(!folio_test_locked(folio));
- if (!folio_buffers(folio))
- folio_create_empty_buffers(folio,
- 1 << READ_ONCE(inode->i_blkbits),
- b_state);
- return folio_buffers(folio);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ 1 << READ_ONCE(inode->i_blkbits), b_state);
+ return bh;
}
/*
@@ -2425,12 +2424,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (!nr) {
/*
- * All buffers are uptodate - we can set the folio uptodate
- * as well. But not if get_block() returned an error.
+ * All buffers are uptodate or get_block() returned an
+ * error when trying to map them - we can finish the read.
*/
- if (!page_error)
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, !page_error);
return 0;
}
@@ -2676,10 +2673,8 @@ int block_truncate_page(struct address_space *mapping,
return PTR_ERR(folio);
bh = folio_buffers(folio);
- if (!bh) {
- folio_create_empty_buffers(folio, blocksize, 0);
- bh = folio_buffers(folio);
- }
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
offset = offset_in_folio(folio, from);
@@ -2988,13 +2983,13 @@ EXPORT_SYMBOL(try_to_free_buffers);
/*
* Buffer-head allocation
*/
-static struct kmem_cache *bh_cachep __read_mostly;
+static struct kmem_cache *bh_cachep __ro_after_init;
/*
* Once the number of bh's in the machine exceeds this level, we start
* stripping them in writeback.
*/
-static unsigned long max_buffer_heads;
+static unsigned long max_buffer_heads __ro_after_init;
int buffer_heads_over_limit;
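
A minimal caller sketch for the buffer.c changes above, assuming only what the hunks show: bdev_getblk() replaces __getblk_gfp(), takes explicit gfp flags, and may return NULL, so callers that want the old never-fail behaviour must add __GFP_NOFAIL themselves (as __bread_gfp() now does). example_getblk() is a made-up name; the GFP_NOWAIT | __GFP_MOVABLE combination mirrors the __breadahead() call site.

/* Illustrative only. */
static struct buffer_head *example_getblk(struct block_device *bdev,
					  sector_t block, unsigned size)
{
	/* Non-blocking lookup/creation; per the new kernel-doc this returns
	 * NULL if memory could not be allocated. */
	return bdev_getblk(bdev, block, size, GFP_NOWAIT | __GFP_MOVABLE);
}
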
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index c53a1d220622..1564eacc253d 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include "super.h"
+#include "mds_client.h"
static inline void ceph_set_cached_acl(struct inode *inode,
int type, struct posix_acl *acl)
@@ -31,6 +32,7 @@ static inline void ceph_set_cached_acl(struct inode *inode,
struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int size;
unsigned int retry_cnt = 0;
const char *name;
@@ -72,8 +74,8 @@ retry:
} else if (size == -ENODATA || size == 0) {
acl = NULL;
} else {
- pr_err_ratelimited("get acl %llx.%llx failed, err=%d\n",
- ceph_vinop(inode), size);
+ pr_err_ratelimited_client(cl, "%llx.%llx failed, err=%d\n",
+ ceph_vinop(inode), size);
acl = ERR_PTR(-EIO);
}
@@ -105,7 +107,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- ret = posix_acl_update_mode(&nop_mnt_idmap, inode,
+ ret = posix_acl_update_mode(idmap, inode,
&new_mode, &acl);
if (ret)
goto out;
@@ -140,7 +142,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- ret = __ceph_setattr(inode, &newattrs, NULL);
+ ret = __ceph_setattr(idmap, inode, &newattrs, NULL);
if (ret)
goto out_free;
}
@@ -151,7 +153,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
newattrs.ia_ctime = old_ctime;
newattrs.ia_mode = old_mode;
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- __ceph_setattr(inode, &newattrs, NULL);
+ __ceph_setattr(idmap, inode, &newattrs, NULL);
}
goto out_free;
}
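
The ceph hunks above and below largely convert dout()/pr_*() calls to the client-aware doutc()/pr_*_client() variants and thread the mount idmap through __ceph_setattr(). A short sketch of the logging pattern, with a made-up example_log() and only helpers that appear in the hunks:

/* Illustrative only: derive the ceph_client from the inode, then log through
 * the client-aware ratelimited helper. */
static void example_log(struct inode *inode, int err)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);

	pr_err_ratelimited_client(cl, "%llx.%llx failed, err=%d\n",
				  ceph_vinop(inode), err);
}
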
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f4863078f7fe..85be3bf18cdf 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -79,18 +79,18 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
*/
static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct inode *inode;
+ struct inode *inode = mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
if (folio_test_dirty(folio)) {
- dout("%p dirty_folio %p idx %lu -- already dirty\n",
- mapping->host, folio, folio->index);
+ doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
+ ceph_vinop(inode), folio, folio->index);
VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
return false;
}
- inode = mapping->host;
ci = ceph_inode(inode);
/* dirty the head */
@@ -111,12 +111,12 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d "
- "snapc %p seq %lld (%d snaps)\n",
- mapping->host, folio, folio->index,
- ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- snapc, snapc->seq, snapc->num_snaps);
+ doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ ceph_vinop(inode), folio, folio->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
@@ -137,23 +137,22 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
static void ceph_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
+ struct inode *inode = folio->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- inode = folio->mapping->host;
- ci = ceph_inode(inode);
if (offset != 0 || length != folio_size(folio)) {
- dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n",
- inode, folio->index, offset, length);
+ doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
+ ceph_vinop(inode), folio->index, offset, length);
return;
}
WARN_ON(!folio_test_locked(folio));
if (folio_test_private(folio)) {
- dout("%p invalidate_folio idx %lu full dirty page\n",
- inode, folio->index);
+ doutc(cl, "%llx.%llx idx %lu full dirty page\n",
+ ceph_vinop(inode), folio->index);
snapc = folio_detach_private(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
@@ -166,10 +165,10 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
{
struct inode *inode = folio->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
- dout("%llx:%llx release_folio idx %lu (%sdirty)\n",
- ceph_vinop(inode),
- folio->index, folio_test_dirty(folio) ? "" : "not ");
+ doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
+ folio->index, folio_test_dirty(folio) ? "" : "not ");
if (folio_test_private(folio))
return false;
@@ -229,7 +228,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
struct inode *inode = subreq->rreq->inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
u32 xlen;
@@ -244,7 +243,8 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
static void finish_netfs_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
struct netfs_io_subrequest *subreq = req->r_priv;
struct ceph_osd_req_op *op = &req->r_ops[0];
@@ -254,8 +254,8 @@ static void finish_netfs_read(struct ceph_osd_request *req)
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, osd_data->length, err);
- dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
- subreq->len, i_size_read(req->r_inode));
+ doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
+ subreq->len, i_size_read(req->r_inode));
/* no object means success but no data */
if (err == -ENOENT)
@@ -348,7 +348,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
@@ -383,7 +384,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
goto out;
}
- dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+ doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
+ ceph_vinop(inode), subreq->start, subreq->len, len);
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
@@ -400,8 +402,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
- dout("%s: iov_ter_get_pages_alloc returned %d\n",
- __func__, err);
+ doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
+ ceph_vinop(inode), err);
goto out;
}
@@ -429,12 +431,13 @@ out:
ceph_osdc_put_request(req);
if (err)
netfs_subreq_terminated(subreq, err, false);
- dout("%s: result %d\n", __func__, err);
+ doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}
static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct inode *inode = rreq->inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int got = 0, want = CEPH_CAP_FILE_CACHE;
struct ceph_netfs_request_data *priv;
int ret = 0;
@@ -466,12 +469,12 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
*/
ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
if (ret < 0) {
- dout("start_read %p, error getting cap\n", inode);
+ doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
goto out;
}
if (!(got & want)) {
- dout("start_read %p, no cache cap\n", inode);
+ doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
ret = -EACCES;
goto out;
}
@@ -563,13 +566,14 @@ get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
struct ceph_snap_context *page_snapc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc = NULL;
struct ceph_cap_snap *capsnap = NULL;
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
- capsnap->context, capsnap->dirty_pages);
+ doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
+ capsnap, capsnap->context, capsnap->dirty_pages);
if (!capsnap->dirty_pages)
continue;
@@ -601,8 +605,8 @@ get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
}
if (!snapc && ci->i_wrbuffer_ref_head) {
snapc = ceph_get_snap_context(ci->i_head_snapc);
- dout(" head snapc %p has %d dirty pages\n",
- snapc, ci->i_wrbuffer_ref_head);
+ doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
+ ci->i_wrbuffer_ref_head);
if (ctl) {
ctl->i_size = i_size_read(inode);
ctl->truncate_size = ci->i_truncate_size;
@@ -658,7 +662,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
int err;
@@ -670,7 +675,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
bool caching = ceph_is_cache_enabled(inode);
struct page *bounce_page = NULL;
- dout("writepage %p idx %lu\n", page, page->index);
+ doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
+ page->index);
if (ceph_inode_is_shutdown(inode))
return -EIO;
@@ -678,13 +684,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
- dout("writepage %p page %p not dirty?\n", inode, page);
+ doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
+ page);
return 0;
}
oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
- dout("writepage %p page %p snapc %p not writeable - noop\n",
- inode, page, snapc);
+ doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
+ ceph_vinop(inode), page, snapc);
/* we should only noop if called by kswapd */
WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
@@ -695,8 +702,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
- dout("folio at %lu beyond eof %llu\n", folio->index,
- ceph_wbc.i_size);
+ doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
+ ceph_vinop(inode), folio->index, ceph_wbc.i_size);
folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
@@ -705,8 +712,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = ceph_wbc.i_size - page_off;
wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
- dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
- inode, page, page->index, page_off, wlen, snapc, snapc->seq);
+ doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
+ ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
+ snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
@@ -747,10 +755,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
osd_req_op_extent_osd_data_pages(req, 0,
bounce_page ? &bounce_page : &page, wlen, 0,
false, false);
- dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n",
- page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not ");
+ doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
+ ceph_vinop(inode), page_off, len, wlen,
+ IS_ENCRYPTED(inode) ? "" : "not ");
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(osdc, req);
err = ceph_osdc_wait_request(osdc, req);
@@ -767,19 +776,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
wbc = &tmp_wbc;
if (err == -ERESTARTSYS) {
/* killed by SIGKILL */
- dout("writepage interrupted page %p\n", page);
+ doutc(cl, "%llx.%llx interrupted page %p\n",
+ ceph_vinop(inode), page);
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
return err;
}
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
- dout("writepage setting page/mapping error %d %p\n",
- err, page);
+ doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
+ ceph_vinop(inode), err, page);
mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++;
} else {
- dout("writepage cleaned page %p\n", page);
+ doutc(cl, "%llx.%llx cleaned page %p\n",
+ ceph_vinop(inode), page);
err = 0; /* vfs expects us to return 0 */
}
oldest = detach_page_private(page);
@@ -803,7 +814,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
ihold(inode);
if (wbc->sync_mode == WB_SYNC_NONE &&
- ceph_inode_to_client(inode)->write_congested)
+ ceph_inode_to_fs_client(inode)->write_congested)
return AOP_WRITEPAGE_ACTIVATE;
wait_on_page_fscache(page);
@@ -829,6 +840,7 @@ static void writepages_finish(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_osd_data *osd_data;
struct page *page;
int num_pages, total_pages = 0;
@@ -836,11 +848,11 @@ static void writepages_finish(struct ceph_osd_request *req)
int rc = req->r_result;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
unsigned int len = 0;
bool remove_page;
- dout("writepages_finish %p rc %d\n", inode, rc);
+ doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
@@ -862,8 +874,10 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
- pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
- __func__, req->r_ops[i].op, req, i, req->r_tid);
+ pr_warn_client(cl,
+ "%llx.%llx incorrect op %d req %p index %d tid %llu\n",
+ ceph_vinop(inode), req->r_ops[i].op, req, i,
+ req->r_tid);
break;
}
@@ -890,7 +904,7 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
- dout("unlocking %p\n", page);
+ doutc(cl, "unlocking %p\n", page);
if (remove_page)
generic_error_remove_page(inode->i_mapping,
@@ -898,8 +912,9 @@ static void writepages_finish(struct ceph_osd_request *req)
unlock_page(page);
}
- dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
- inode, osd_data->length, rc >= 0 ? num_pages : 0);
+ doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
+ ceph_vinop(inode), osd_data->length,
+ rc >= 0 ? num_pages : 0);
release_pages(osd_data->pages, num_pages);
}
@@ -926,7 +941,8 @@ static int ceph_writepages_start(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start_index, end = -1;
struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
@@ -944,15 +960,15 @@ static int ceph_writepages_start(struct address_space *mapping,
fsc->write_congested)
return 0;
- dout("writepages_start %p (mode=%s)\n", inode,
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+ doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
- pr_warn_ratelimited(
- "writepage_start %p %lld forced umount\n",
- inode, ceph_ino(inode));
+ pr_warn_ratelimited_client(cl,
+ "%llx.%llx %lld forced umount\n",
+ ceph_vinop(inode), ceph_ino(inode));
}
mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
@@ -976,11 +992,11 @@ retry:
if (!snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
- dout(" no snap context with dirty data?\n");
+ doutc(cl, " no snap context with dirty data?\n");
goto out;
}
- dout(" oldest snapc is %p seq %lld (%d snaps)\n",
- snapc, snapc->seq, snapc->num_snaps);
+ doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
+ snapc->seq, snapc->num_snaps);
should_loop = false;
if (ceph_wbc.head_snapc && snapc != last_snapc) {
@@ -990,13 +1006,13 @@ retry:
end = -1;
if (index > 0)
should_loop = true;
- dout(" cyclic, start at %lu\n", index);
+ doutc(cl, " cyclic, start at %lu\n", index);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = true;
- dout(" not cyclic, %lu to %lu\n", index, end);
+ doutc(cl, " not cyclic, %lu to %lu\n", index, end);
}
} else if (!ceph_wbc.head_snapc) {
/* Do not respect wbc->range_{start,end}. Dirty pages
@@ -1005,7 +1021,7 @@ retry:
* associated with 'snapc' get written */
if (index > 0)
should_loop = true;
- dout(" non-head snapc, range whole\n");
+ doutc(cl, " non-head snapc, range whole\n");
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
@@ -1028,12 +1044,12 @@ retry:
get_more_pages:
nr_folios = filemap_get_folios_tag(mapping, &index,
end, tag, &fbatch);
- dout("pagevec_lookup_range_tag got %d\n", nr_folios);
+ doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios);
if (!nr_folios && !locked_pages)
break;
for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
page = &fbatch.folios[i]->page;
- dout("? %p idx %lu\n", page, page->index);
+ doutc(cl, "? %p idx %lu\n", page, page->index);
if (locked_pages == 0)
lock_page(page); /* first page */
else if (!trylock_page(page))
@@ -1042,15 +1058,15 @@ get_more_pages:
/* only dirty pages, or our accounting breaks */
if (unlikely(!PageDirty(page)) ||
unlikely(page->mapping != mapping)) {
- dout("!dirty or !mapping %p\n", page);
+ doutc(cl, "!dirty or !mapping %p\n", page);
unlock_page(page);
continue;
}
/* only if matching snap context */
pgsnapc = page_snap_context(page);
if (pgsnapc != snapc) {
- dout("page snapc %p %lld != oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
+ doutc(cl, "page snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq, snapc, snapc->seq);
if (!should_loop &&
!ceph_wbc.head_snapc &&
wbc->sync_mode != WB_SYNC_NONE)
@@ -1061,8 +1077,8 @@ get_more_pages:
if (page_offset(page) >= ceph_wbc.i_size) {
struct folio *folio = page_folio(page);
- dout("folio at %lu beyond eof %llu\n",
- folio->index, ceph_wbc.i_size);
+ doutc(cl, "folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
folio_pos(folio) >= i_size_read(inode)) &&
folio_clear_dirty_for_io(folio))
@@ -1072,23 +1088,23 @@ get_more_pages:
continue;
}
if (strip_unit_end && (page->index > strip_unit_end)) {
- dout("end of strip unit %p\n", page);
+ doutc(cl, "end of strip unit %p\n", page);
unlock_page(page);
break;
}
if (PageWriteback(page) || PageFsCache(page)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
- dout("%p under writeback\n", page);
+ doutc(cl, "%p under writeback\n", page);
unlock_page(page);
continue;
}
- dout("waiting on writeback %p\n", page);
+ doutc(cl, "waiting on writeback %p\n", page);
wait_on_page_writeback(page);
wait_on_page_fscache(page);
}
if (!clear_page_dirty_for_io(page)) {
- dout("%p !clear_page_dirty_for_io\n", page);
+ doutc(cl, "%p !clear_page_dirty_for_io\n", page);
unlock_page(page);
continue;
}
@@ -1143,8 +1159,8 @@ get_more_pages:
}
/* note position of first page in fbatch */
- dout("%p will write page %p idx %lu\n",
- inode, page, page->index);
+ doutc(cl, "%llx.%llx will write page %p idx %lu\n",
+ ceph_vinop(inode), page, page->index);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
@@ -1158,8 +1174,9 @@ get_more_pages:
locked_pages ? GFP_NOWAIT : GFP_NOFS);
if (IS_ERR(pages[locked_pages])) {
if (PTR_ERR(pages[locked_pages]) == -EINVAL)
- pr_err("%s: inode->i_blkbits=%hhu\n",
- __func__, inode->i_blkbits);
+ pr_err_client(cl,
+ "inode->i_blkbits=%hhu\n",
+ inode->i_blkbits);
/* better not fail on first page! */
BUG_ON(locked_pages == 0);
pages[locked_pages] = NULL;
@@ -1193,7 +1210,7 @@ get_more_pages:
if (nr_folios && i == nr_folios &&
locked_pages < max_pages) {
- dout("reached end fbatch, trying for more\n");
+ doutc(cl, "reached end fbatch, trying for more\n");
folio_batch_release(&fbatch);
goto get_more_pages;
}
@@ -1254,8 +1271,8 @@ new_request:
/* Start a new extent */
osd_req_op_extent_dup_last(req, op_idx,
cur_offset - offset);
- dout("writepages got pages at %llu~%llu\n",
- offset, len);
+ doutc(cl, "got pages at %llu~%llu\n", offset,
+ len);
osd_req_op_extent_osd_data_pages(req, op_idx,
data_pages, len, 0,
from_pool, false);
@@ -1288,12 +1305,13 @@ new_request:
if (IS_ENCRYPTED(inode))
len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
- dout("writepages got pages at %llu~%llu\n", offset, len);
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
if (IS_ENCRYPTED(inode) &&
((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
- pr_warn("%s: bad encrypted write offset=%lld len=%llu\n",
- __func__, offset, len);
+ pr_warn_client(cl,
+ "bad encrypted write offset=%lld len=%llu\n",
+ offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
0, from_pool, false);
@@ -1327,7 +1345,7 @@ new_request:
pages = NULL;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
req = NULL;
@@ -1345,14 +1363,14 @@ new_request:
done = true;
release_folios:
- dout("folio_batch release on %d folios (%p)\n", (int)fbatch.nr,
- fbatch.nr ? fbatch.folios[0] : NULL);
+ doutc(cl, "folio_batch release on %d folios (%p)\n",
+ (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL);
folio_batch_release(&fbatch);
}
if (should_loop && !done) {
/* more to do; loop back to beginning of file */
- dout("writepages looping back to beginning of file\n");
+ doutc(cl, "looping back to beginning of file\n");
end = start_index - 1; /* OK even when start_index == 0 */
/* to write dirty pages associated with next snapc,
@@ -1390,7 +1408,8 @@ release_folios:
out:
ceph_osdc_put_request(req);
ceph_put_snap_context(last_snapc);
- dout("writepages dend - startone, rc = %d\n", rc);
+ doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
+ rc);
return rc;
}
@@ -1424,11 +1443,12 @@ static struct ceph_snap_context *
ceph_find_incompatible(struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (ceph_inode_is_shutdown(inode)) {
- dout(" page %p %llx:%llx is shutdown\n", page,
- ceph_vinop(inode));
+ doutc(cl, " %llx.%llx page %p is shutdown\n",
+ ceph_vinop(inode), page);
return ERR_PTR(-ESTALE);
}
@@ -1449,13 +1469,15 @@ ceph_find_incompatible(struct page *page)
if (snapc->seq > oldest->seq) {
/* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n", page, snapc);
+ doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n",
+ ceph_vinop(inode), page, snapc);
return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
/* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n", page, snapc);
+ doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n",
+ ceph_vinop(inode), page, snapc);
if (clear_page_dirty_for_io(page)) {
int r = writepage_nounlock(page, NULL);
if (r < 0)
@@ -1524,10 +1546,11 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
{
struct folio *folio = page_folio(subpage);
struct inode *inode = file_inode(file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
bool check_cap = false;
- dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
- inode, folio, (int)pos, (int)copied, (int)len);
+ doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode),
+ file, folio, (int)pos, (int)copied, (int)len);
if (!folio_test_uptodate(folio)) {
/* just return that nothing was copied on a short copy */
@@ -1587,6 +1610,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
@@ -1598,8 +1622,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ceph_block_sigs(&oldset);
- dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
- inode, ceph_vinop(inode), off);
+ doutc(cl, "%llx.%llx %llu trying to get caps\n",
+ ceph_vinop(inode), off);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
@@ -1610,8 +1634,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
if (err < 0)
goto out_restore;
- dout("filemap_fault %p %llu got cap refs on %s\n",
- inode, off, ceph_cap_string(got));
+ doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode),
+ off, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
!ceph_has_inline_data(ci)) {
@@ -1619,8 +1643,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ceph_add_rw_context(fi, &rw_ctx);
ret = filemap_fault(vmf);
ceph_del_rw_context(fi, &rw_ctx);
- dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
- inode, off, ceph_cap_string(got), ret);
+ doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n",
+ ceph_vinop(inode), off, ceph_cap_string(got), ret);
} else
err = -EAGAIN;
@@ -1661,8 +1685,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
filemap_invalidate_unlock_shared(mapping);
- dout("filemap_fault %p %llu read inline data ret %x\n",
- inode, off, ret);
+ doutc(cl, "%llx.%llx %llu read inline data ret %x\n",
+ ceph_vinop(inode), off, ret);
}
out_restore:
ceph_restore_sigs(&oldset);
@@ -1676,6 +1700,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_cap_flush *prealloc_cf;
@@ -1702,8 +1727,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
else
len = offset_in_thp(page, size);
- dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
- inode, ceph_vinop(inode), off, len, size);
+ doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
+ ceph_vinop(inode), off, len, size);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
else
@@ -1714,8 +1739,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
if (err < 0)
goto out_free;
- dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
- inode, off, len, ceph_cap_string(got));
+ doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
+ off, len, ceph_cap_string(got));
/* Update time before taking page lock */
file_update_time(vma->vm_file);
@@ -1763,8 +1788,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
__mark_inode_dirty(inode, dirty);
}
- dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n",
- inode, off, len, ceph_cap_string(got), ret);
+ doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n",
+ ceph_vinop(inode), off, len, ceph_cap_string(got), ret);
ceph_put_cap_refs_async(ci, got);
out_free:
ceph_restore_sigs(&oldset);
@@ -1778,6 +1803,7 @@ out_free:
void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -1798,8 +1824,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
- dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
- inode, ceph_vinop(inode), len, locked_page);
+ doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode,
+ ceph_vinop(inode), len, locked_page);
if (len > 0) {
void *kaddr = kmap_atomic(page);
@@ -1823,7 +1849,8 @@ int ceph_uninline_data(struct file *file)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_cap_flush *prealloc_cf = NULL;
struct folio *folio = NULL;
@@ -1836,8 +1863,8 @@ int ceph_uninline_data(struct file *file)
inline_version = ci->i_inline_version;
spin_unlock(&ci->i_ceph_lock);
- dout("uninline_data %p %llx.%llx inline_version %llu\n",
- inode, ceph_vinop(inode), inline_version);
+ doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode),
+ inline_version);
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -1875,7 +1902,7 @@ int ceph_uninline_data(struct file *file)
goto out_unlock;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
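
Independent of the logging rework, the direct inode->i_mtime reads become inode_get_mtime() (and inode->i_atime becomes inode_get_atime() in the cap message hunk further down), the accessor pair that returns the timestamp as a struct timespec64 by value instead of touching the inode fields directly. A minimal usage sketch, with the helper name here being hypothetical:

/* Usage sketch: read the timestamp through the accessor, not the field. */
static void set_request_mtime_sketch(struct ceph_osd_request *req,
				     struct inode *inode)
{
	/* r_mtime is a struct timespec64; the accessor returns one by value */
	req->r_mtime = inode_get_mtime(inode);
}
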
@@ -1917,7 +1944,7 @@ int ceph_uninline_data(struct file *file)
goto out_put_req;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1949,8 +1976,8 @@ out_unlock:
}
out:
ceph_free_cap_flush(prealloc_cf);
- dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
- inode, ceph_vinop(inode), inline_version, err);
+ doutc(cl, "%llx.%llx inline_version %llu = %d\n",
+ ceph_vinop(inode), inline_version, err);
return err;
}
@@ -1977,8 +2004,9 @@ enum {
static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
s64 pool, struct ceph_string *pool_ns)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
struct rb_node **p, *parent;
struct ceph_pool_perm *perm;
@@ -2013,10 +2041,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
goto out;
if (pool_ns)
- dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
- pool, (int)pool_ns->len, pool_ns->str);
+ doutc(cl, "pool %lld ns %.*s no perm cached\n", pool,
+ (int)pool_ns->len, pool_ns->str);
else
- dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
+ doutc(cl, "pool %lld no perm cached\n", pool);
down_write(&mdsc->pool_perm_rwsem);
p = &mdsc->pool_perm_tree.rb_node;
@@ -2092,7 +2120,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
0, false, true);
ceph_osdc_start_request(&fsc->client->osdc, rd_req);
- wr_req->r_mtime = ci->netfs.inode.i_mtime;
+ wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode);
ceph_osdc_start_request(&fsc->client->osdc, wr_req);
err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
@@ -2141,15 +2169,16 @@ out:
if (!err)
err = have;
if (pool_ns)
- dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
- pool, (int)pool_ns->len, pool_ns->str, err);
+ doutc(cl, "pool %lld ns %.*s result = %d\n", pool,
+ (int)pool_ns->len, pool_ns->str, err);
else
- dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
+ doutc(cl, "pool %lld result = %d\n", pool, err);
return err;
}
int ceph_pool_perm_check(struct inode *inode, int need)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_string *pool_ns;
s64 pool;
@@ -2168,7 +2197,7 @@ int ceph_pool_perm_check(struct inode *inode, int need)
return 0;
}
- if (ceph_test_mount_opt(ceph_inode_to_client(inode),
+ if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode),
NOPOOLPERM))
return 0;
@@ -2179,13 +2208,11 @@ int ceph_pool_perm_check(struct inode *inode, int need)
check:
if (flags & CEPH_I_POOL_PERM) {
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
- dout("ceph_pool_perm_check pool %lld no read perm\n",
- pool);
+ doutc(cl, "pool %lld no read perm\n", pool);
return -EPERM;
}
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
- dout("ceph_pool_perm_check pool %lld no write perm\n",
- pool);
+ doutc(cl, "pool %lld no write perm\n", pool);
return -EPERM;
}
return 0;
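
The addr.c hunks above all follow the same conversion: the plain dout() debug macro becomes the client-aware doutc(), taking a struct ceph_client pointer obtained via ceph_inode_to_client(), and each message is prefixed with the inode identity via ceph_vinop(), which supplies the ino/snap pair matching the "%llx.%llx" specifiers. A rough sketch of the pattern follows; the macro body and the client fields used for the prefix are assumptions for illustration, not the exact in-tree definition.

/*
 * Assumed shape of the client-aware debug macro (illustration only; the
 * real definition lives in the ceph debug header). The point is that every
 * debug line identifies which client instance and which inode it refers to,
 * so logs from multiple CephFS mounts on one host can be told apart.
 */
#define doutc_sketch(client, fmt, ...)					\
	dout("[%pU %llu]: " fmt, &(client)->fsid,			\
	     (client)->monc.auth->global_id, ##__VA_ARGS__)

/* Typical converted call site (hypothetical helper, ceph headers assumed): */
static void dump_page_sketch(struct inode *inode, struct page *page)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);

	/* ceph_vinop() expands to the ino/snap pair for the two %llx fields */
	doutc(cl, "%llx.%llx page %p\n", ceph_vinop(inode), page);
}
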
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index de1dee46d3df..930fbd54d2c8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -15,7 +15,7 @@
void ceph_fscache_register_inode_cookie(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
/* No caching for filesystem? */
if (!fsc->fscache)
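
The one-line cache.c change is the helper rename applied throughout the series: ceph_inode_to_fs_client() is now the accessor that yields the struct ceph_fs_client, which frees the ceph_inode_to_client() name for the lower-level struct ceph_client that doutc() and the pr_*_client() helpers take. A simplified sketch of how the two accessors are assumed to relate (not the exact in-tree definitions, hence the sketch_ names):

/* Sketch under assumptions: simplified forms of the two accessors. */
static inline struct ceph_fs_client *
sketch_inode_to_fs_client(struct inode *inode)
{
	/* the fs-specific client hangs off the superblock private data */
	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}

static inline struct ceph_client *
sketch_inode_to_client(struct inode *inode)
{
	/* the libceph client embedded in the fs client */
	return sketch_inode_to_fs_client(inode)->client;
}
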
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 14215ec646f7..2c0b8dc3dd0d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -186,10 +186,10 @@ static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
mdsc->caps_avail_count += nr_caps;
}
- dout("%s: caps %d = %d used + %d resv + %d avail\n",
- __func__,
- mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(mdsc->fsc->client,
+ "caps %d = %d used + %d resv + %d avail\n",
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
mdsc->caps_reserve_count +
mdsc->caps_avail_count);
@@ -202,6 +202,7 @@ static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int i, j;
struct ceph_cap *cap;
int have;
@@ -212,7 +213,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s;
LIST_HEAD(newcaps);
- dout("reserve caps ctx=%p need=%d\n", ctx, need);
+ doutc(cl, "ctx=%p need=%d\n", ctx, need);
/* first reserve any caps that are already allocated */
spin_lock(&mdsc->caps_list_lock);
@@ -272,8 +273,8 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
continue;
}
- pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have + alloc);
+ pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need,
+ have + alloc);
err = -ENOMEM;
break;
}
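
Warnings and errors in these paths get the same treatment as the debug messages: pr_warn() becomes pr_warn_client(), and later hunks use pr_err_client() and pr_warn_ratelimited_client(), so user-visible messages carry the same client tag as doutc(). The prefix shown below is an assumption, mirroring the doutc sketch earlier:

/* Assumed shape of the client-tagged printk wrapper (illustration only). */
#define pr_warn_client_sketch(client, fmt, ...)				\
	pr_warn("[%pU %llu]: " fmt, &(client)->fsid,			\
		(client)->monc.auth->global_id, ##__VA_ARGS__)
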
@@ -298,20 +299,21 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->caps_list_lock);
- dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
- ctx, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
return err;
}
void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx)
{
+ struct ceph_client *cl = mdsc->fsc->client;
bool reclaim = false;
if (!ctx->count)
return;
- dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+ doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count);
spin_lock(&mdsc->caps_list_lock);
__ceph_unreserve_caps(mdsc, ctx->count);
ctx->count = 0;
@@ -328,6 +330,7 @@ void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap = NULL;
/* temporary, until we do something about cap import/export */
@@ -359,9 +362,9 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
}
spin_lock(&mdsc->caps_list_lock);
- dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
- ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx,
+ ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
BUG_ON(!ctx->count);
BUG_ON(ctx->count > mdsc->caps_reserve_count);
BUG_ON(list_empty(&mdsc->caps_list));
@@ -382,10 +385,12 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
{
+ struct ceph_client *cl = mdsc->fsc->client;
+
spin_lock(&mdsc->caps_list_lock);
- dout("put_cap %p %d = %d used + %d resv + %d avail\n",
- cap, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
mdsc->caps_use_count--;
/*
* Keep some preallocated caps around (ceph_min_count), to
@@ -491,11 +496,13 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
- ci->i_hold_caps_max - jiffies);
+ doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode,
+ ceph_vinop(inode), ci->i_hold_caps_max - jiffies);
}
/*
@@ -509,8 +516,11 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
- ci->i_ceph_flags, ci->i_hold_caps_max);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n",
+ inode, ceph_vinop(inode), ci->i_ceph_flags,
+ ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
if (!list_empty(&ci->i_cap_delay_list)) {
@@ -533,7 +543,9 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -550,7 +562,9 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -562,6 +576,9 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
unsigned had = __ceph_caps_issued(ci, NULL);
lockdep_assert_held(&ci->i_ceph_lock);
@@ -586,7 +603,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
if (issued & CEPH_CAP_FILE_SHARED)
atomic_inc(&ci->i_shared_gen);
if (S_ISDIR(ci->netfs.inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->netfs.inode);
+ doutc(cl, " marking %p NOT complete\n", inode);
__ceph_dir_clear_complete(ci);
}
}
@@ -635,7 +652,8 @@ void ceph_add_cap(struct inode *inode,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap **new_cap)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
int mds = session->s_mds;
@@ -644,8 +662,9 @@ void ceph_add_cap(struct inode *inode,
lockdep_assert_held(&ci->i_ceph_lock);
- dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
- session->s_mds, cap_id, ceph_cap_string(issued), seq);
+ doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode,
+ ceph_vinop(inode), session->s_mds, cap_id,
+ ceph_cap_string(issued), seq);
gen = atomic_read(&session->s_cap_gen);
@@ -723,9 +742,9 @@ void ceph_add_cap(struct inode *inode,
actual_wanted = __ceph_caps_wanted(ci);
if ((wanted & ~actual_wanted) ||
(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
- dout(" issued %s, mds wanted %s, actual %s, queueing\n",
- ceph_cap_string(issued), ceph_cap_string(wanted),
- ceph_cap_string(actual_wanted));
+ doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
__cap_delay_requeue(mdsc, ci);
}
@@ -742,9 +761,9 @@ void ceph_add_cap(struct inode *inode,
WARN_ON(ci->i_auth_cap == cap);
}
- dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
- inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
- ceph_cap_string(issued|cap->issued), seq, mds);
+ doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
@@ -766,6 +785,8 @@ void ceph_add_cap(struct inode *inode,
*/
static int __cap_is_valid(struct ceph_cap *cap)
{
+ struct inode *inode = &cap->ci->netfs.inode;
+ struct ceph_client *cl = cap->session->s_mdsc->fsc->client;
unsigned long ttl;
u32 gen;
@@ -773,9 +794,9 @@ static int __cap_is_valid(struct ceph_cap *cap)
ttl = cap->session->s_cap_ttl;
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
- dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
- cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+ doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -789,6 +810,8 @@ static int __cap_is_valid(struct ceph_cap *cap)
*/
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int have = ci->i_snap_caps;
struct ceph_cap *cap;
struct rb_node *p;
@@ -799,8 +822,8 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode,
+ ceph_vinop(inode), cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -843,16 +866,18 @@ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
*/
static void __touch_cap(struct ceph_cap *cap)
{
+ struct inode *inode = &cap->ci->netfs.inode;
struct ceph_mds_session *s = cap->session;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
spin_lock(&s->s_cap_lock);
if (!s->s_cap_iterator) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
- s->s_mds);
+ doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode,
+ ceph_vinop(inode), cap, s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
- dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->netfs.inode, cap, s->s_mds);
+ doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n",
+ inode, ceph_vinop(inode), cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -864,15 +889,16 @@ static void __touch_cap(struct ceph_cap *cap)
*/
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
struct rb_node *p;
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode),
- ceph_cap_string(have),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n",
+ inode, ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(mask));
return 1;
}
@@ -881,10 +907,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
if (touch)
__touch_cap(cap);
return 1;
@@ -893,10 +919,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode),
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
if (touch) {
struct rb_node *q;
@@ -922,7 +948,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
int touch)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
int r;
r = __ceph_caps_issued_mask(ci, mask, touch);
@@ -954,13 +980,14 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret;
spin_lock(&ci->i_ceph_lock);
ret = __ceph_caps_revoking_other(ci, NULL, mask);
spin_unlock(&ci->i_ceph_lock);
- dout("ceph_caps_revoking %p %s = %d\n", inode,
- ceph_cap_string(mask), ret);
+ doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode),
+ ceph_cap_string(mask), ret);
return ret;
}
@@ -996,7 +1023,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->netfs.inode)->mount_options;
+ ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
@@ -1107,21 +1134,23 @@ int ceph_is_any_caps(struct inode *inode)
void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
struct ceph_mds_session *session = cap->session;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_inode_info *ci = cap->ci;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc;
int removed = 0;
	/* 'ci' being NULL means the remove has already occurred */
	/* 'ci' being NULL means the remove has already occurred */
if (!ci) {
- dout("%s: cap inode is NULL\n", __func__);
+ doutc(cl, "inode is NULL\n");
return;
}
lockdep_assert_held(&ci->i_ceph_lock);
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
+ doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode));
- mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;
+ mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
@@ -1132,8 +1161,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
spin_lock(&session->s_cap_lock);
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
- dout("__ceph_remove_cap delaying %p removal from session %p\n",
- cap, cap->session);
+ doutc(cl, "delaying %p removal from session %p\n", cap,
+ cap->session);
} else {
list_del_init(&cap->session_caps);
session->s_nr_caps--;
@@ -1178,20 +1207,21 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
}
}
-void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ bool queue_release)
{
struct ceph_inode_info *ci = cap->ci;
struct ceph_fs_client *fsc;
	/* 'ci' being NULL means the remove has already occurred */
if (!ci) {
- dout("%s: cap inode is NULL\n", __func__);
+ doutc(mdsc->fsc->client, "inode is NULL\n");
return;
}
lockdep_assert_held(&ci->i_ceph_lock);
- fsc = ceph_inode_to_client(&ci->netfs.inode);
+ fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
WARN_ON_ONCE(ci->i_auth_cap == cap &&
!list_empty(&ci->i_dirty_item) &&
!fsc->blocklisted &&
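
ceph_remove_cap() now takes the ceph_mds_client explicitly, so the client pointer for doutc() stays reachable even in the early-return case where cap->ci is NULL and nothing can be derived from the inode. Callers already hold the mdsc and pass it down, as the __ceph_remove_caps() hunk below shows; a small call-site sketch (the wrapper name is hypothetical):

/* Call-site sketch: derive mdsc from the inode once, then pass it down. */
static void drop_one_cap_sketch(struct inode *inode, struct ceph_cap *cap)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;

	ceph_remove_cap(mdsc, cap, true);	/* queue_release = true */
}
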
@@ -1227,15 +1257,19 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
void *p;
- struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
-
- dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
- __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
- ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
- ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
- arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
- arg->size, arg->max_size, arg->xattr_version,
- arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
+ struct ceph_mds_client *mdsc = arg->session->s_mdsc;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+
+ doutc(mdsc->fsc->client,
+ "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u"
+ " tid %llu/%llu mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n",
+ ceph_cap_op_name(arg->op), arg->cid, arg->ino,
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
+ arg->size, arg->max_size, arg->xattr_version,
+ arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
msg->hdr.version = cpu_to_le16(12);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@@ -1342,6 +1376,8 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
*/
void __ceph_remove_caps(struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
struct rb_node *p;
/* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
@@ -1351,7 +1387,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
p = rb_next(p);
- ceph_remove_cap(cap, true);
+ ceph_remove_cap(mdsc, cap, true);
}
spin_unlock(&ci->i_ceph_lock);
}
@@ -1370,6 +1406,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int held, revoking;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1378,10 +1415,10 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
- dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
- __func__, inode, cap, cap->session,
- ceph_cap_string(held), ceph_cap_string(held & retain),
- ceph_cap_string(revoking));
+ doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n",
+ inode, ceph_vinop(inode), cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
ci->i_ceph_flags &= ~CEPH_I_FLUSH;
@@ -1421,8 +1458,8 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
arg->old_xattr_buf = NULL;
}
- arg->mtime = inode->i_mtime;
- arg->atime = inode->i_atime;
+ arg->mtime = inode_get_mtime(inode);
+ arg->atime = inode_get_atime(inode);
arg->ctime = inode_get_ctime(inode);
arg->btime = ci->i_btime;
arg->change_attr = inode_peek_iversion_raw(inode);
@@ -1497,13 +1534,16 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
struct ceph_msg *msg;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
false);
if (!msg) {
- pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
- ceph_vinop(inode), ceph_cap_string(arg->dirty),
- arg->flush_tid);
+ pr_err_client(cl,
+ "error allocating cap msg: ino (%llx.%llx)"
+ " flushing %s tid %llu, requeuing cap.\n",
+ ceph_vinop(inode), ceph_cap_string(arg->dirty),
+ arg->flush_tid);
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(arg->session->s_mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
@@ -1592,11 +1632,13 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_snap *capsnap;
u64 oldest_flush_tid = 0;
u64 first_tid = 1, last_tid = 0;
- dout("__flush_snaps %p session %p\n", inode, session);
+ doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode),
+ session);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
/*
@@ -1611,7 +1653,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
/* only flush each capsnap once */
if (capsnap->cap_flush.tid > 0) {
- dout(" already flushed %p, skipping\n", capsnap);
+ doutc(cl, "already flushed %p, skipping\n", capsnap);
continue;
}
@@ -1643,8 +1685,8 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
int ret;
if (!(cap && cap->session == session)) {
- dout("__flush_snaps %p auth cap %p not mds%d, "
- "stop\n", inode, cap, session->s_mds);
+ doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n",
+ inode, ceph_vinop(inode), cap, session->s_mds);
break;
}
@@ -1665,15 +1707,17 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
- dout("__flush_snaps %p capsnap %p tid %llu %s\n",
- inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+ doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode,
+ ceph_vinop(inode), capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
oldest_flush_tid);
if (ret < 0) {
- pr_err("__flush_snaps: error sending cap flushsnap, "
- "ino (%llx.%llx) tid %llu follows %llu\n",
- ceph_vinop(inode), cf->tid, capsnap->follows);
+ pr_err_client(cl, "error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
}
ceph_put_cap_snap(capsnap);
@@ -1685,28 +1729,29 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession)
{
struct inode *inode = &ci->netfs.inode;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_mds_session *session = NULL;
bool need_put = false;
int mds;
- dout("ceph_flush_snaps %p\n", inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (psession)
session = *psession;
retry:
spin_lock(&ci->i_ceph_lock);
if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
- dout(" no capsnap needs flush, doing nothing\n");
+ doutc(cl, " no capsnap needs flush, doing nothing\n");
goto out;
}
if (!ci->i_auth_cap) {
- dout(" no auth cap (migrating?), doing nothing\n");
+ doutc(cl, " no auth cap (migrating?), doing nothing\n");
goto out;
}
mds = ci->i_auth_cap->session->s_mds;
if (session && session->s_mds != mds) {
- dout(" oops, wrong session %p mutex\n", session);
+ doutc(cl, " oops, wrong session %p mutex\n", session);
ceph_put_mds_session(session);
session = NULL;
}
@@ -1750,23 +1795,25 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc;
+ ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int was = ci->i_dirty_caps;
int dirty = 0;
lockdep_assert_held(&ci->i_ceph_lock);
if (!ci->i_auth_cap) {
- pr_warn("__mark_dirty_caps %p %llx mask %s, "
- "but no auth cap (session was closed?)\n",
- inode, ceph_ino(inode), ceph_cap_string(mask));
+ pr_warn_client(cl, "%p %llx.%llx mask %s, "
+ "but no auth cap (session was closed?)\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(mask));
return 0;
}
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
- ceph_cap_string(mask), ceph_cap_string(was),
- ceph_cap_string(was | mask));
+ doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(mask),
+ ceph_cap_string(was), ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
if (was == 0) {
struct ceph_mds_session *session = ci->i_auth_cap->session;
@@ -1779,8 +1826,9 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
}
- dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
+ doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n",
+ inode, ceph_vinop(inode), ci->i_head_snapc,
+ ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &session->s_cap_dirty);
@@ -1873,7 +1921,8 @@ static u64 __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session, bool wake,
u64 *oldest_flush_tid)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap_flush *cf = NULL;
int flushing;
@@ -1884,13 +1933,13 @@ static u64 __mark_caps_flushing(struct inode *inode,
BUG_ON(!ci->i_prealloc_cap_flush);
flushing = ci->i_dirty_caps;
- dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
- ceph_cap_string(flushing),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps | flushing));
+ doutc(cl, "flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
ci->i_flushing_caps |= flushing;
ci->i_dirty_caps = 0;
- dout(" inode %p now !dirty\n", inode);
+ doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
swap(cf, ci->i_prealloc_cap_flush);
cf->caps = flushing;
@@ -1921,6 +1970,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
@@ -1932,12 +1982,13 @@ static int try_nonblocking_invalidate(struct inode *inode)
if (inode->i_data.nrpages == 0 &&
invalidating_gen == ci->i_rdcache_gen) {
/* success. */
- dout("try_nonblocking_invalidate %p success\n", inode);
+ doutc(cl, "%p %llx.%llx success\n", inode,
+ ceph_vinop(inode));
/* save any racing async invalidate some trouble */
ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
return 0;
}
- dout("try_nonblocking_invalidate %p failed\n", inode);
+ doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode));
return -1;
}
@@ -1969,6 +2020,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
@@ -2043,9 +2095,9 @@ retry:
}
}
- dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
- ceph_cap_string(file_wanted),
+ doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s "
+ "flushing %s issued %s revoking %s retain %s %s%s%s\n",
+ inode, ceph_vinop(inode), ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
@@ -2066,10 +2118,10 @@ retry:
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
- dout("check_caps trying to invalidate on %llx.%llx\n",
- ceph_vinop(inode));
+ doutc(cl, "trying to invalidate on %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
if (try_nonblocking_invalidate(inode) < 0) {
- dout("check_caps queuing invalidate\n");
+ doutc(cl, "queuing invalidate\n");
queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
@@ -2097,35 +2149,35 @@ retry:
cap_used &= ~ci->i_auth_cap->issued;
revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
- cap->mds, cap, ceph_cap_string(cap_used),
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->implemented),
- ceph_cap_string(revoking));
+ doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n",
+ cap->mds, cap, ceph_cap_string(cap_used),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented),
+ ceph_cap_string(revoking));
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
if (ci->i_wanted_max_size > ci->i_max_size &&
ci->i_wanted_max_size > ci->i_requested_max_size) {
- dout("requesting new max_size\n");
+ doutc(cl, "requesting new max_size\n");
goto ack;
}
/* approaching file_max? */
if (__ceph_should_report_size(ci)) {
- dout("i_size approaching max_size\n");
+ doutc(cl, "i_size approaching max_size\n");
goto ack;
}
}
/* flush anything dirty? */
if (cap == ci->i_auth_cap) {
if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
+ doutc(cl, "flushing dirty caps\n");
goto ack;
}
if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
- dout("flushing snap caps\n");
+ doutc(cl, "flushing snap caps\n");
goto ack;
}
}
@@ -2133,7 +2185,7 @@ retry:
/* completed revocation? going down and there are no caps? */
if (revoking) {
if ((revoking & cap_used) == 0) {
- dout("completed revocation of %s\n",
+ doutc(cl, "completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
}
@@ -2232,7 +2284,7 @@ ack:
*/
static int try_flush_caps(struct inode *inode, u64 *ptid)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int flushing = 0;
u64 flush_tid = 0, oldest_flush_tid = 0;
@@ -2310,7 +2362,8 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
*/
static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
int ret, err = 0;
@@ -2400,8 +2453,9 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
kfree(sessions);
}
- dout("%s %p wait on tid %llu %llu\n", __func__,
- inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode,
+ ceph_vinop(inode), req1 ? req1->r_tid : 0ULL,
+ req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
ceph_timeout_jiffies(req1->r_timeout));
@@ -2427,11 +2481,13 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u64 flush_tid;
int ret, err;
int dirty;
- dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+ doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode),
+ datasync ? " datasync" : "");
ret = file_write_and_wait_range(file, start, end);
if (datasync)
@@ -2442,7 +2498,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
dirty = try_flush_caps(inode, &flush_tid);
- dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+ doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty));
err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
@@ -2463,7 +2519,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (err < 0)
ret = err;
out:
- dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
+ doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode),
+ datasync ? " datasync" : "", ret);
return ret;
}
@@ -2476,12 +2533,13 @@ out:
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u64 flush_tid;
int err = 0;
int dirty;
int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
- dout("write_inode %p wait=%d\n", inode, wait);
+ doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait);
ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
err = ceph_wait_on_async_create(inode);
@@ -2493,7 +2551,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
caps_are_flushed(inode, flush_tid));
} else {
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_fs_client(inode->i_sb)->mdsc;
spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_dirty(ci))
@@ -2511,6 +2569,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
__acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
int ret;
@@ -2536,8 +2595,8 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- inode, cap, session->s_mds);
+ pr_err_client(cl, "%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
break;
}
@@ -2546,8 +2605,9 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
if (!cf->is_capsnap) {
struct cap_msg_args arg;
- dout("kick_flushing_caps %p cap %p tid %llu %s\n",
- inode, cap, cf->tid, ceph_cap_string(cf->caps));
+ doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n",
+ inode, ceph_vinop(inode), cap, cf->tid,
+ ceph_cap_string(cf->caps));
__prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
(cf->tid < last_snap_flush ?
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
@@ -2561,9 +2621,9 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
cap_flush);
- dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
- inode, capsnap, cf->tid,
- ceph_cap_string(capsnap->dirty));
+ doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n",
+ inode, ceph_vinop(inode), capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
@@ -2571,11 +2631,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
oldest_flush_tid);
if (ret < 0) {
- pr_err("kick_flushing_caps: error sending "
- "cap flushsnap, ino (%llx.%llx) "
- "tid %llu follows %llu\n",
- ceph_vinop(inode), cf->tid,
- capsnap->follows);
+ pr_err_client(cl, "error sending cap flushsnap,"
+ " %p %llx.%llx tid %llu follows %llu\n",
+ inode, ceph_vinop(inode), cf->tid,
+ capsnap->follows);
}
ceph_put_cap_snap(capsnap);
@@ -2588,22 +2647,26 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
u64 oldest_flush_tid;
- dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->netfs.inode;
+
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- &ci->netfs.inode, cap, session->s_mds);
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2636,24 +2699,28 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
u64 oldest_flush_tid;
lockdep_assert_held(&session->s_mutex);
- dout("kick_flushing_caps mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->netfs.inode;
+
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- &ci->netfs.inode, cap, session->s_mds);
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2670,11 +2737,13 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
{
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap *cap = ci->i_auth_cap;
+ struct inode *inode = &ci->netfs.inode;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
- ceph_cap_string(ci->i_flushing_caps));
+ doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
u64 oldest_flush_tid;
@@ -2696,6 +2765,9 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
bool snap_rwsem_locked)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
lockdep_assert_held(&ci->i_ceph_lock);
if (got & CEPH_CAP_PIN)
@@ -2716,10 +2788,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
}
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->netfs.inode);
+ ihold(inode);
ci->i_wb_ref++;
- dout("%s %p wb %d -> %d (?)\n", __func__,
- &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2746,20 +2818,23 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
loff_t endoff, int flags, int *got)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret = 0;
int have, implemented;
bool snap_rwsem_locked = false;
- dout("get_cap_refs %p need %s want %s\n", inode,
- ceph_cap_string(need), ceph_cap_string(want));
+ doutc(cl, "%p %llx.%llx need %s want %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(want));
again:
spin_lock(&ci->i_ceph_lock);
if ((flags & CHECK_FILELOCK) &&
(ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
- dout("try_get_cap_refs %p error filelock\n", inode);
+ doutc(cl, "%p %llx.%llx error filelock\n", inode,
+ ceph_vinop(inode));
ret = -EIO;
goto out_unlock;
}
@@ -2779,8 +2854,8 @@ again:
if (have & need & CEPH_CAP_FILE_WR) {
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
- dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
- inode, endoff, ci->i_max_size);
+ doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n",
+ inode, ceph_vinop(inode), endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
goto out_unlock;
@@ -2790,7 +2865,8 @@ again:
* can get a final snapshot value for size+mtime.
*/
if (__ceph_have_pending_cap_snap(ci)) {
- dout("get_cap_refs %p cap_snap_pending\n", inode);
+ doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode,
+ ceph_vinop(inode));
goto out_unlock;
}
}
@@ -2808,9 +2884,9 @@ again:
int not = want & ~(have & need);
int revoking = implemented & ~have;
int exclude = revoking & not;
- dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
- inode, ceph_cap_string(have), ceph_cap_string(not),
- ceph_cap_string(revoking));
+ doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n",
+ inode, ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(not), ceph_cap_string(revoking));
if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
if (!snap_rwsem_locked &&
!ci->i_head_snapc &&
@@ -2850,28 +2926,31 @@ again:
spin_unlock(&s->s_cap_lock);
}
if (session_readonly) {
- dout("get_cap_refs %p need %s but mds%d readonly\n",
- inode, ceph_cap_string(need), ci->i_auth_cap->mds);
+ doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n",
+ inode, ceph_vinop(inode), ceph_cap_string(need),
+ ci->i_auth_cap->mds);
ret = -EROFS;
goto out_unlock;
}
if (ceph_inode_is_shutdown(inode)) {
- dout("get_cap_refs %p inode is shutdown\n", inode);
+ doutc(cl, "%p %llx.%llx inode is shutdown\n",
+ inode, ceph_vinop(inode));
ret = -ESTALE;
goto out_unlock;
}
mds_wanted = __ceph_caps_mds_wanted(ci, false);
if (need & ~mds_wanted) {
- dout("get_cap_refs %p need %s > mds_wanted %s\n",
- inode, ceph_cap_string(need),
- ceph_cap_string(mds_wanted));
+ doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n",
+ inode, ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(mds_wanted));
ret = -EUCLEAN;
goto out_unlock;
}
- dout("get_cap_refs %p have %s need %s\n", inode,
- ceph_cap_string(have), ceph_cap_string(need));
+ doutc(cl, "%p %llx.%llx have %s need %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(need));
}
out_unlock:
@@ -2886,8 +2965,8 @@ out_unlock:
else if (ret == 1)
ceph_update_cap_hit(&mdsc->metric);
- dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(*got));
+ doutc(cl, "%p %llx.%llx ret %d got %s\n", inode,
+ ceph_vinop(inode), ret, ceph_cap_string(*got));
return ret;
}
@@ -2899,13 +2978,14 @@ out_unlock:
static void check_max_size(struct inode *inode, loff_t endoff)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int check = 0;
/* do we need to explicitly request a larger max_size? */
spin_lock(&ci->i_ceph_lock);
if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
- dout("write %p at large endoff %llu, req max_size\n",
- inode, endoff);
+ doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n",
+ inode, ceph_vinop(inode), endoff);
ci->i_wanted_max_size = endoff;
}
/* duplicate ceph_check_caps()'s logic */
@@ -2964,7 +3044,7 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
int want, loff_t endoff, int *got)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int ret, _got, flags;
ret = ceph_pool_perm_check(inode, need);
@@ -3115,10 +3195,12 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
if (!capsnap->need_flush &&
!capsnap->writing && !capsnap->dirty_pages) {
- dout("dropping cap_snap %p follows %llu\n",
- capsnap, capsnap->follows);
+ doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows);
BUG_ON(capsnap->cap_flush.tid > 0);
ceph_put_snap_context(capsnap->context);
if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
@@ -3150,6 +3232,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
enum put_cap_refs_mode mode)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int last = 0, put = 0, flushsnaps = 0, wake = 0;
bool check_flushsnaps = false;
@@ -3172,8 +3255,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
put++;
check_flushsnaps = true;
}
- dout("put_cap_refs %p wb %d -> %d (?)\n",
- inode, ci->i_wb_ref+1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref);
}
if (had & CEPH_CAP_FILE_WR) {
if (--ci->i_wr_ref == 0) {
@@ -3213,8 +3296,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
}
spin_unlock(&ci->i_ceph_lock);
- dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
- last ? " last" : "", put ? " put" : "");
+ doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode),
+ ceph_cap_string(had), last ? " last" : "", put ? " put" : "");
switch (mode) {
case PUT_CAP_REFS_SYNC:
@@ -3264,6 +3347,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0;
bool last = false;
@@ -3287,11 +3371,10 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
}
- dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
- inode,
- ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- last ? " LAST" : "");
+ doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n",
+ inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr,
+ ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref,
+ ci->i_wrbuffer_ref_head, last ? " LAST" : "");
} else {
list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
if (iter->context == snapc) {
@@ -3321,13 +3404,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
}
}
}
- dout("put_wrbuffer_cap_refs on %p cap_snap %p "
- " snap %lld %d/%d -> %d/%d %s%s\n",
- inode, capsnap, capsnap->context->seq,
- ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
- ci->i_wrbuffer_ref, capsnap->dirty_pages,
- last ? " (wrbuffer last)" : "",
- complete_capsnap ? " (complete capsnap)" : "");
+ doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n",
+ inode, ceph_vinop(inode), capsnap, capsnap->context->seq,
+ ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+ ci->i_wrbuffer_ref, capsnap->dirty_pages,
+ last ? " (wrbuffer last)" : "",
+ complete_capsnap ? " (complete capsnap)" : "");
}
unlock:
@@ -3350,9 +3432,10 @@ unlock:
*/
static void invalidate_aliases(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct dentry *dn, *prev = NULL;
- dout("invalidate_aliases inode %p\n", inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
d_prune_aliases(inode);
/*
* For non-directory inode, d_find_alias() only returns
@@ -3411,6 +3494,7 @@ static void handle_cap_grant(struct inode *inode,
__releases(ci->i_ceph_lock)
__releases(session->s_mdsc->snap_rwsem)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
@@ -3434,10 +3518,11 @@ static void handle_cap_grant(struct inode *inode,
if (IS_ENCRYPTED(inode) && size)
size = extra_info->fscrypt_file_size;
- dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
- inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
- dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
- i_size_read(inode));
+ doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode,
+ ceph_vinop(inode), cap, session->s_mds, seq,
+ ceph_cap_string(newcaps));
+ doutc(cl, " size %llu max_size %llu, i_size %llu\n", size,
+ max_size, i_size_read(inode));
/*
@@ -3497,15 +3582,17 @@ static void handle_cap_grant(struct inode *inode,
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
ci->i_btime = extra_info->btime;
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kgid(&init_user_ns, inode->i_gid));
+ doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
+ ceph_vinop(inode), inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len ||
memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth,
ci->fscrypt_auth_len))
- pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
- __func__, ci->fscrypt_auth_len,
+ pr_warn_ratelimited_client(cl,
+ "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
+ ci->fscrypt_auth_len,
extra_info->fscrypt_auth_len);
#endif
}
@@ -3523,8 +3610,8 @@ static void handle_cap_grant(struct inode *inode,
u64 version = le64_to_cpu(grant->xattr_version);
if (version > ci->i_xattrs.version) {
- dout(" got new xattrs v%llu on %p len %d\n",
- version, inode, len);
+ doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n",
+ version, inode, ceph_vinop(inode), len);
if (ci->i_xattrs.blob)
ceph_buffer_put(ci->i_xattrs.blob);
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
@@ -3575,8 +3662,8 @@ static void handle_cap_grant(struct inode *inode,
if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
if (max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n",
- ci->i_max_size, max_size);
+ doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size,
+ max_size);
ci->i_max_size = max_size;
if (max_size >= ci->i_wanted_max_size) {
ci->i_wanted_max_size = 0; /* reset */
@@ -3590,10 +3677,9 @@ static void handle_cap_grant(struct inode *inode,
wanted = __ceph_caps_wanted(ci);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
- dout(" my wanted = %s, used = %s, dirty %s\n",
- ceph_cap_string(wanted),
- ceph_cap_string(used),
- ceph_cap_string(dirty));
+ doutc(cl, " my wanted = %s, used = %s, dirty %s\n",
+ ceph_cap_string(wanted), ceph_cap_string(used),
+ ceph_cap_string(dirty));
if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
(wanted & ~(cap->mds_wanted | newcaps))) {
@@ -3614,10 +3700,9 @@ static void handle_cap_grant(struct inode *inode,
if (cap->issued & ~newcaps) {
int revoking = cap->issued & ~newcaps;
- dout("revocation: %s -> %s (revoking %s)\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps),
- ceph_cap_string(revoking));
+ doutc(cl, "revocation: %s -> %s (revoking %s)\n",
+ ceph_cap_string(cap->issued), ceph_cap_string(newcaps),
+ ceph_cap_string(revoking));
if (S_ISREG(inode->i_mode) &&
(revoking & used & CEPH_CAP_FILE_BUFFER))
writeback = true; /* initiate writeback; will delay ack */
@@ -3635,11 +3720,12 @@ static void handle_cap_grant(struct inode *inode,
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
- dout("caps unchanged: %s -> %s\n",
- ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+ doutc(cl, "caps unchanged: %s -> %s\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
} else {
- dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps));
+ doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
/* non-auth MDS is revoking the newly granted caps? */
if (cap == ci->i_auth_cap &&
__ceph_caps_revoking_other(ci, cap, newcaps))
@@ -3727,7 +3813,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
__releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_flush *cf, *tmp_cf;
LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
@@ -3764,11 +3851,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
- dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
- " flushing %s -> %s\n",
- inode, session->s_mds, seq, ceph_cap_string(dirty),
- ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+ doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n",
+ inode, ceph_vinop(inode), session->s_mds, seq,
+ ceph_cap_string(dirty), ceph_cap_string(cleaned),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps & ~cleaned));
if (list_empty(&to_remove) && !cleaned)
goto out;
@@ -3784,18 +3871,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
if (list_empty(&ci->i_cap_flush_list)) {
list_del_init(&ci->i_flushing_item);
if (!list_empty(&session->s_cap_flushing)) {
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_first_entry(&session->s_cap_flushing,
- struct ceph_inode_info,
- i_flushing_item)->netfs.inode);
+ struct inode *inode =
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->netfs.inode;
+ doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n",
+ session->s_mds, inode, ceph_vinop(inode));
}
}
mdsc->num_cap_flushing--;
- dout(" inode %p now !flushing\n", inode);
+ doutc(cl, " %p %llx.%llx now !flushing\n", inode,
+ ceph_vinop(inode));
if (ci->i_dirty_caps == 0) {
- dout(" inode %p now clean\n", inode);
+ doutc(cl, " %p %llx.%llx now clean\n", inode,
+ ceph_vinop(inode));
BUG_ON(!list_empty(&ci->i_dirty_item));
drop = true;
if (ci->i_wr_ref == 0 &&
@@ -3833,12 +3923,14 @@ void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
bool *wake_ci, bool *wake_mdsc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
bool ret;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
+ doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap,
+ inode, ceph_vinop(inode), ci);
list_del_init(&capsnap->ci_item);
ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
@@ -3877,29 +3969,31 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap = NULL, *iter;
bool wake_ci = false;
bool wake_mdsc = false;
- dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
- inode, ci, session->s_mds, follows);
+ doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode,
+ ceph_vinop(inode), ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
if (iter->follows == follows) {
if (iter->cap_flush.tid != flush_tid) {
- dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", iter, follows,
- flush_tid, iter->cap_flush.tid);
+ doutc(cl, " cap_snap %p follows %lld "
+ "tid %lld != %lld\n", iter,
+ follows, flush_tid,
+ iter->cap_flush.tid);
break;
}
capsnap = iter;
break;
} else {
- dout(" skipping cap_snap %p follows %lld\n",
- iter, iter->follows);
+ doutc(cl, " skipping cap_snap %p follows %lld\n",
+ iter, iter->follows);
}
}
if (capsnap)
@@ -3928,6 +4022,7 @@ static bool handle_cap_trunc(struct inode *inode,
struct cap_extra_info *extra_info)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int mds = session->s_mds;
int seq = le32_to_cpu(trunc->seq);
u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
@@ -3950,8 +4045,8 @@ static bool handle_cap_trunc(struct inode *inode,
if (IS_ENCRYPTED(inode) && size)
size = extra_info->fscrypt_file_size;
- dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n",
- __func__, inode, mds, seq, truncate_size, truncate_seq);
+ doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n",
+ inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq);
queue_trunc = ceph_fill_file_size(inode, issued,
truncate_seq, truncate_size, size);
return queue_trunc;
@@ -3969,7 +4064,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *tsession = NULL;
struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -3989,8 +4085,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
target = -1;
}
- dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
- inode, ci, mds, mseq, target);
+ doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n",
+ inode, ceph_vinop(inode), ci, mds, mseq, target);
retry:
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
@@ -3999,7 +4095,7 @@ retry:
goto out_unlock;
if (target < 0) {
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
}
@@ -4010,12 +4106,13 @@ retry:
issued = cap->issued;
if (issued != cap->implemented)
- pr_err_ratelimited("handle_cap_export: issued != implemented: "
- "ino (%llx.%llx) mds%d seq %d mseq %d "
- "issued %s implemented %s\n",
- ceph_vinop(inode), mds, cap->seq, cap->mseq,
- ceph_cap_string(issued),
- ceph_cap_string(cap->implemented));
+ pr_err_ratelimited_client(cl, "issued != implemented: "
+ "%p %llx.%llx mds%d seq %d mseq %d"
+ " issued %s implemented %s\n",
+ inode, ceph_vinop(inode), mds,
+ cap->seq, cap->mseq,
+ ceph_cap_string(issued),
+ ceph_cap_string(cap->implemented));
tcap = __get_cap_for_mds(ci, target);
@@ -4023,7 +4120,8 @@ retry:
/* already have caps from the target */
if (tcap->cap_id == t_cap_id &&
ceph_seq_cmp(tcap->seq, t_seq) < 0) {
- dout(" updating import cap %p mds%d\n", tcap, target);
+ doutc(cl, " updating import cap %p mds%d\n", tcap,
+ target);
tcap->cap_id = t_cap_id;
tcap->seq = t_seq - 1;
tcap->issue_seq = t_seq - 1;
@@ -4034,7 +4132,7 @@ retry:
change_auth_cap_ses(ci, tcap->session);
}
}
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
} else if (tsession) {
/* add placeholder for the export target */
@@ -4051,7 +4149,7 @@ retry:
spin_unlock(&mdsc->cap_dirty_lock);
}
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
}
@@ -4104,6 +4202,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
struct ceph_cap **target_cap, int *old_issued)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap, *ocap, *new_cap = NULL;
int mds = session->s_mds;
int issued;
@@ -4124,8 +4223,8 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
peer = -1;
}
- dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
- inode, ci, mds, mseq, peer);
+ doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n",
+ inode, ceph_vinop(inode), ci, mds, mseq, peer);
retry:
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
@@ -4151,20 +4250,20 @@ retry:
ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
if (ocap && ocap->cap_id == p_cap_id) {
- dout(" remove export cap %p mds%d flags %d\n",
- ocap, peer, ph->flags);
+ doutc(cl, " remove export cap %p mds%d flags %d\n",
+ ocap, peer, ph->flags);
if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
(ocap->seq != le32_to_cpu(ph->seq) ||
ocap->mseq != le32_to_cpu(ph->mseq))) {
- pr_err_ratelimited("handle_cap_import: "
- "mismatched seq/mseq: ino (%llx.%llx) "
- "mds%d seq %d mseq %d importer mds%d "
- "has peer seq %d mseq %d\n",
- ceph_vinop(inode), peer, ocap->seq,
- ocap->mseq, mds, le32_to_cpu(ph->seq),
+ pr_err_ratelimited_client(cl, "mismatched seq/mseq: "
+ "%p %llx.%llx mds%d seq %d mseq %d"
+ " importer mds%d has peer seq %d mseq %d\n",
+ inode, ceph_vinop(inode), peer,
+ ocap->seq, ocap->mseq, mds,
+ le32_to_cpu(ph->seq),
le32_to_cpu(ph->mseq));
}
- ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
}
*old_issued = issued;
@@ -4227,6 +4326,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
@@ -4245,7 +4345,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
bool close_sessions = false;
bool do_cap_release = false;
- dout("handle_caps from mds%d\n", session->s_mds);
+ doutc(cl, "from mds%d\n", session->s_mds);
if (!ceph_inc_mds_stopping_blocker(mdsc, session))
return;
@@ -4347,15 +4447,15 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
- vino.snap, inode);
+ doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op),
+ vino.ino, vino.snap, inode);
mutex_lock(&session->s_mutex);
- dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
- (unsigned)seq);
+ doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds,
+ session->s_seq, (unsigned)seq);
if (!inode) {
- dout(" i don't have ino %llx\n", vino.ino);
+ doutc(cl, " i don't have ino %llx\n", vino.ino);
switch (op) {
case CEPH_CAP_OP_IMPORT:
@@ -4410,9 +4510,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
if (!cap) {
- dout(" no cap on %p ino %llx.%llx from mds%d\n",
- inode, ceph_ino(inode), ceph_snap(inode),
- session->s_mds);
+ doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n",
+ inode, ceph_ino(inode), ceph_snap(inode),
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
switch (op) {
case CEPH_CAP_OP_REVOKE:
@@ -4450,8 +4550,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
default:
spin_unlock(&ci->i_ceph_lock);
- pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
- ceph_cap_op_name(op));
+ pr_err_client(cl, "unknown cap op %d %s\n", op,
+ ceph_cap_op_name(op));
}
done:
@@ -4492,7 +4592,7 @@ flush_cap_releases:
goto done;
bad:
- pr_err("ceph_handle_caps: corrupt message\n");
+ pr_err_client(cl, "corrupt message\n");
ceph_msg_dump(msg);
goto out;
}
@@ -4506,6 +4606,7 @@ bad:
*/
unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
@@ -4513,14 +4614,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
unsigned long loop_start = jiffies;
unsigned long delay = 0;
- dout("check_delayed_caps\n");
+ doutc(cl, "begin\n");
spin_lock(&mdsc->cap_delay_lock);
while (!list_empty(&mdsc->cap_delay_list)) {
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
- dout("%s caps added recently. Exiting loop", __func__);
+ doutc(cl, "caps added recently. Exiting loop");
delay = ci->i_hold_caps_max;
break;
}
@@ -4532,13 +4633,15 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
inode = igrab(&ci->netfs.inode);
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
- dout("check_delayed_caps on %p\n", inode);
+ doutc(cl, "on %p %llx.%llx\n", inode,
+ ceph_vinop(inode));
ceph_check_caps(ci, 0);
iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
}
spin_unlock(&mdsc->cap_delay_lock);
+ doutc(cl, "done\n");
return delay;
}
@@ -4549,17 +4652,18 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
static void flush_dirty_session_caps(struct ceph_mds_session *s)
{
struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *inode;
- dout("flush_dirty_caps\n");
+ doutc(cl, "begin\n");
spin_lock(&mdsc->cap_dirty_lock);
while (!list_empty(&s->s_cap_dirty)) {
ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
inode = &ci->netfs.inode;
ihold(inode);
- dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
ceph_wait_on_async_create(inode);
ceph_check_caps(ci, CHECK_CAPS_FLUSH);
@@ -4567,7 +4671,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
spin_lock(&mdsc->cap_dirty_lock);
}
spin_unlock(&mdsc->cap_dirty_lock);
- dout("flush_dirty_caps done\n");
+ doutc(cl, "done\n");
}
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
@@ -4672,7 +4776,7 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
- ceph_inode_to_client(inode)->mdsc;
+ ceph_inode_to_fs_client(inode)->mdsc;
__cap_delay_requeue_front(mdsc, ci);
}
}
@@ -4692,6 +4796,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
struct ceph_mds_request_release *rel = *p;
int used, dirty;
@@ -4701,9 +4806,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
- dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
- inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
- ceph_cap_string(unless));
+ doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n",
+ inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty),
+ ceph_cap_string(drop), ceph_cap_string(unless));
/* only drop unused, clean caps */
drop &= ~(used | dirty);
@@ -4725,12 +4830,13 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
if (force || (cap->issued & drop)) {
if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
- dout("encode_inode_release %p cap %p "
- "%s -> %s, wanted %s -> %s\n", inode, cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(wanted));
+ doutc(cl, "%p %llx.%llx cap %p %s -> %s, "
+ "wanted %s -> %s\n", inode,
+ ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
cap->issued &= ~drop;
cap->implemented &= ~drop;
@@ -4739,9 +4845,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
!(wanted & CEPH_CAP_ANY_FILE_WR))
ci->i_requested_max_size = 0;
} else {
- dout("encode_inode_release %p cap %p %s"
- " (force)\n", inode, cap,
- ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p %s (force)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued));
}
rel->ino = cpu_to_le64(ceph_ino(inode));
@@ -4756,8 +4862,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
*p += sizeof(*rel);
ret = 1;
} else {
- dout("encode_inode_release %p cap %p %s (noop)\n",
- inode, cap, ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p %s (noop)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued));
}
}
spin_unlock(&ci->i_ceph_lock);
@@ -4783,6 +4890,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct ceph_client *cl;
int force = 0;
int ret;
@@ -4804,10 +4912,11 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
dput(parent);
+ cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
- dout("encode_dentry_release %p mds%d seq %d\n",
- dentry, mds, (int)di->lease_seq);
+ doutc(cl, "%p mds%d seq %d\n", dentry, mds,
+ (int)di->lease_seq);
rel->dname_seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
spin_unlock(&dentry->d_lock);
@@ -4833,12 +4942,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_snap *capsnap;
int capsnap_release = 0;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
+ doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n",
+ ci, inode, ceph_vinop(inode));
while (!list_empty(&ci->i_cap_snaps)) {
capsnap = list_first_entry(&ci->i_cap_snaps,
@@ -4855,8 +4966,9 @@ static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
bool is_auth;
bool dirty_dropped = false;
@@ -4864,8 +4976,8 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->netfs.inode);
+ doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n",
+ cap, ci, inode, ceph_vinop(inode));
is_auth = (cap == ci->i_auth_cap);
__ceph_remove_cap(cap, false);
@@ -4892,19 +5004,19 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
}
if (!list_empty(&ci->i_dirty_item)) {
- pr_warn_ratelimited(
- " dropping dirty %s state for %p %lld\n",
+ pr_warn_ratelimited_client(cl,
+ " dropping dirty %s state for %p %llx.%llx\n",
ceph_cap_string(ci->i_dirty_caps),
- inode, ceph_ino(inode));
+ inode, ceph_vinop(inode));
ci->i_dirty_caps = 0;
list_del_init(&ci->i_dirty_item);
dirty_dropped = true;
}
if (!list_empty(&ci->i_flushing_item)) {
- pr_warn_ratelimited(
- " dropping dirty+flushing %s state for %p %lld\n",
+ pr_warn_ratelimited_client(cl,
+ " dropping dirty+flushing %s state for %p %llx.%llx\n",
ceph_cap_string(ci->i_flushing_caps),
- inode, ceph_ino(inode));
+ inode, ceph_vinop(inode));
ci->i_flushing_caps = 0;
list_del_init(&ci->i_flushing_item);
mdsc->num_cap_flushing--;
@@ -4927,8 +5039,9 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
if (atomic_read(&ci->i_filelock_ref) > 0) {
/* make further file lock syscall return -EIO */
ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
- pr_warn_ratelimited(" dropping file locks for %p %lld\n",
- inode, ceph_ino(inode));
+ pr_warn_ratelimited_client(cl,
+ " dropping file locks for %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
}
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
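
A minimal sketch (not part of the patch) of the pattern the caps.c hunks above apply: debug lines gain a per-client context and identify inodes by vino rather than a bare pointer. It assumes the fs/ceph "super.h" helpers that the diff itself uses (doutc, ceph_inode_to_client, ceph_vinop):

#include "super.h"	/* fs/ceph private header providing the helpers below */

static void example_cap_debug(struct inode *inode)
{
	/* per-client debug context, so the log line carries the client identity */
	struct ceph_client *cl = ceph_inode_to_client(inode);

	/* identify the inode by vino (ino.snap) in addition to its pointer */
	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
}
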
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 5b5112c78462..3b3c4d8d401e 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -113,7 +113,7 @@ static int ceph_crypt_set_context(struct inode *inode, const void *ctx,
cia.fscrypt_auth = cfa;
- ret = __ceph_setattr(inode, &attr, &cia);
+ ret = __ceph_setattr(&nop_mnt_idmap, inode, &attr, &cia);
if (ret == 0)
inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
kfree(cia.fscrypt_auth);
@@ -129,10 +129,11 @@ static bool ceph_crypt_empty_dir(struct inode *inode)
static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
{
- return ceph_sb_to_client(sb)->fsc_dummy_enc_policy.policy;
+ return ceph_sb_to_fs_client(sb)->fsc_dummy_enc_policy.policy;
}
static struct fscrypt_operations ceph_fscrypt_ops = {
+ .needs_bounce_pages = 1,
.get_context = ceph_crypt_get_context,
.set_context = ceph_crypt_set_context,
.get_dummy_policy = ceph_get_dummy_policy,
@@ -211,6 +212,7 @@ void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
static struct inode *parse_longname(const struct inode *parent,
const char *name, int *name_len)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = NULL;
struct ceph_vino vino = { .snap = CEPH_NOSNAP };
char *inode_number;
@@ -222,12 +224,12 @@ static struct inode *parse_longname(const struct inode *parent,
name++;
name_end = strrchr(name, '_');
if (!name_end) {
- dout("Failed to parse long snapshot name: %s\n", name);
+ doutc(cl, "failed to parse long snapshot name: %s\n", name);
return ERR_PTR(-EIO);
}
*name_len = (name_end - name);
if (*name_len <= 0) {
- pr_err("Failed to parse long snapshot name\n");
+ pr_err_client(cl, "failed to parse long snapshot name\n");
return ERR_PTR(-EIO);
}
@@ -239,7 +241,7 @@ static struct inode *parse_longname(const struct inode *parent,
return ERR_PTR(-ENOMEM);
ret = kstrtou64(inode_number, 10, &vino.ino);
if (ret) {
- dout("Failed to parse inode number: %s\n", name);
+ doutc(cl, "failed to parse inode number: %s\n", name);
dir = ERR_PTR(ret);
goto out;
}
@@ -250,7 +252,7 @@ static struct inode *parse_longname(const struct inode *parent,
/* This can happen if we're not mounting cephfs on the root */
dir = ceph_get_inode(parent->i_sb, vino, NULL);
if (IS_ERR(dir))
- dout("Can't find inode %s (%s)\n", inode_number, name);
+ doutc(cl, "can't find inode %s (%s)\n", inode_number, name);
}
out:
@@ -261,6 +263,7 @@ out:
int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
char *buf)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = parent;
struct qstr iname;
u32 len;
@@ -329,7 +332,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
/* base64 encode the encrypted name */
elen = ceph_base64_encode(cryptbuf, len, buf);
- dout("base64-encoded ciphertext name = %.*s\n", elen, buf);
+ doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf);
/* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */
WARN_ON(elen > 240);
@@ -504,7 +507,10 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
unsigned int offs, u64 lblk_num)
{
- dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
+ ceph_vinop(inode), len, offs, lblk_num);
return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num);
}
@@ -513,7 +519,10 @@ int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
unsigned int offs, u64 lblk_num,
gfp_t gfp_flags)
{
- dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
+ ceph_vinop(inode), len, offs, lblk_num);
return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num,
gfp_flags);
}
@@ -582,6 +591,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
u64 off, struct ceph_sparse_extent *map,
u32 ext_cnt)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int i, ret = 0;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
@@ -589,7 +599,8 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
/* Nothing to do for empty array */
if (ext_cnt == 0) {
- dout("%s: empty array, ret 0\n", __func__);
+ doutc(cl, "%p %llx.%llx empty array, ret 0\n", inode,
+ ceph_vinop(inode));
return 0;
}
@@ -603,14 +614,17 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
int fret;
if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) {
- pr_warn("%s: bad encrypted sparse extent idx %d off %llx len %llx\n",
- __func__, i, ext->off, ext->len);
+ pr_warn_client(cl,
+ "%p %llx.%llx bad encrypted sparse extent "
+ "idx %d off %llx len %llx\n",
+ inode, ceph_vinop(inode), i, ext->off,
+ ext->len);
return -EIO;
}
fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx],
off + pgsoff, ext->len);
- dout("%s: [%d] 0x%llx~0x%llx fret %d\n", __func__, i,
- ext->off, ext->len, fret);
+ doutc(cl, "%p %llx.%llx [%d] 0x%llx~0x%llx fret %d\n", inode,
+ ceph_vinop(inode), i, ext->off, ext->len, fret);
if (fret < 0) {
if (ret == 0)
ret = fret;
@@ -618,7 +632,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
}
ret = pgsoff + fret;
}
- dout("%s: ret %d\n", __func__, ret);
+ doutc(cl, "ret %d\n", ret);
return ret;
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3904333fa6c3..24c08078f5aa 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -81,7 +81,7 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
&pathbase, 0);
if (IS_ERR(path))
path = NULL;
@@ -100,7 +100,7 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
&pathbase, 0);
if (IS_ERR(path))
path = NULL;
@@ -398,7 +398,7 @@ DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
- dout("ceph_fs_debugfs_cleanup\n");
+ doutc(fsc->client, "begin\n");
debugfs_remove(fsc->debugfs_bdi);
debugfs_remove(fsc->debugfs_congestion_kb);
debugfs_remove(fsc->debugfs_mdsmap);
@@ -407,13 +407,14 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_status);
debugfs_remove(fsc->debugfs_mdsc);
debugfs_remove_recursive(fsc->debugfs_metrics_dir);
+ doutc(fsc->client, "done\n");
}
void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
char name[100];
- dout("ceph_fs_debugfs_init\n");
+ doutc(fsc->client, "begin\n");
fsc->debugfs_congestion_kb =
debugfs_create_file("writeback_congestion_kb",
0600,
@@ -469,6 +470,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
&metrics_size_fops);
debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
&metrics_caps_fops);
+ doutc(fsc->client, "done\n");
}
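
Where no inode is at hand, the hunks above derive the same client context from the fs client or MDS client instead. A small sketch of that variant, under the same header assumptions as the previous example:

static void example_mdsc_debug(struct ceph_mds_client *mdsc)
{
	struct ceph_client *cl = mdsc->fsc->client;	/* or fsc->client when an fsc is available */

	doutc(cl, "begin\n");
	/* ... work elided ... */
	doutc(cl, "done\n");
}
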
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 854cbdd66661..91709934c8b1 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -109,7 +109,9 @@ static int fpos_cmp(loff_t l, loff_t r)
* regardless of what dir changes take place on the
* server.
*/
-static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
+static int note_last_dentry(struct ceph_fs_client *fsc,
+ struct ceph_dir_file_info *dfi,
+ const char *name,
int len, unsigned next_offset)
{
char *buf = kmalloc(len+1, GFP_KERNEL);
@@ -120,7 +122,7 @@ static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
memcpy(dfi->last_name, name, len);
dfi->last_name[len] = 0;
dfi->next_offset = next_offset;
- dout("note_last_dentry '%s'\n", dfi->last_name);
+ doutc(fsc->client, "'%s'\n", dfi->last_name);
return 0;
}
@@ -130,6 +132,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
struct ceph_readdir_cache_control *cache_ctl)
{
struct inode *dir = d_inode(parent);
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct dentry *dentry;
unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
loff_t ptr_pos = idx * sizeof(struct dentry *);
@@ -142,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
ceph_readdir_cache_release(cache_ctl);
cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
if (!cache_ctl->page) {
- dout(" page %lu not found\n", ptr_pgoff);
+ doutc(cl, " page %lu not found\n", ptr_pgoff);
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
@@ -185,13 +188,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct ceph_dir_file_info *dfi = file->private_data;
struct dentry *parent = file->f_path.dentry;
struct inode *dir = d_inode(parent);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(dir);
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
struct ceph_readdir_cache_control cache_ctl = {};
u64 idx = 0;
int err = 0;
- dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);
+ doutc(cl, "%p %llx.%llx v%u at %llx\n", dir, ceph_vinop(dir),
+ (unsigned)shared_gen, ctx->pos);
/* search start position */
if (ctx->pos > 2) {
@@ -221,7 +227,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
dput(dentry);
}
- dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
+ doutc(cl, "%p %llx.%llx cache idx %llu\n", dir,
+ ceph_vinop(dir), idx);
}
@@ -257,8 +264,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
spin_unlock(&dentry->d_lock);
if (emit_dentry) {
- dout(" %llx dentry %p %pd %p\n", di->offset,
- dentry, dentry, d_inode(dentry));
+ doutc(cl, " %llx dentry %p %pd %p\n", di->offset,
+ dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
@@ -281,7 +288,8 @@ out:
if (last) {
int ret;
di = ceph_dentry(last);
- ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len,
+ ret = note_last_dentry(fsc, dfi, last->d_name.name,
+ last->d_name.len,
fpos_off(di->offset) + 1);
if (ret < 0)
err = ret;
@@ -310,20 +318,23 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
int i;
int err;
unsigned frag = -1;
struct ceph_mds_reply_info_parsed *rinfo;
- dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
+ doutc(cl, "%p %llx.%llx file %p pos %llx\n", inode,
+ ceph_vinop(inode), file, ctx->pos);
if (dfi->file_info.flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
if (ctx->pos == 0) {
- dout("readdir off 0 -> '.'\n");
+ doutc(cl, "%p %llx.%llx off 0 -> '.'\n", inode,
+ ceph_vinop(inode));
if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
inode->i_mode >> 12))
return 0;
@@ -337,7 +348,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ino = ceph_present_inode(dentry->d_parent->d_inode);
spin_unlock(&dentry->d_lock);
- dout("readdir off 1 -> '..'\n");
+ doutc(cl, "%p %llx.%llx off 1 -> '..'\n", inode,
+ ceph_vinop(inode));
if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
return 0;
ctx->pos = 2;
@@ -391,8 +403,8 @@ more:
frag = fpos_frag(ctx->pos);
}
- dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
- ceph_vinop(inode), frag, dfi->last_name);
+ doutc(cl, "fetching %p %llx.%llx frag %x offset '%s'\n",
+ inode, ceph_vinop(inode), frag, dfi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -446,12 +458,12 @@ more:
ceph_mdsc_put_request(req);
return err;
}
- dout("readdir got and parsed readdir result=%d on "
- "frag %x, end=%d, complete=%d, hash_order=%d\n",
- err, frag,
- (int)req->r_reply_info.dir_end,
- (int)req->r_reply_info.dir_complete,
- (int)req->r_reply_info.hash_order);
+ doutc(cl, "%p %llx.%llx got and parsed readdir result=%d"
+ "on frag %x, end=%d, complete=%d, hash_order=%d\n",
+ inode, ceph_vinop(inode), err, frag,
+ (int)req->r_reply_info.dir_end,
+ (int)req->r_reply_info.dir_complete,
+ (int)req->r_reply_info.hash_order);
rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
@@ -481,7 +493,8 @@ more:
dfi->dir_ordered_count = req->r_dir_ordered_cnt;
}
} else {
- dout("readdir !did_prepopulate\n");
+ doutc(cl, "%p %llx.%llx !did_prepopulate\n", inode,
+ ceph_vinop(inode));
/* disable readdir cache */
dfi->readdir_cache_idx = -1;
/* preclude from marking dir complete */
@@ -494,8 +507,8 @@ more:
rinfo->dir_entries + (rinfo->dir_nr-1);
unsigned next_offset = req->r_reply_info.dir_end ?
2 : (fpos_off(rde->offset) + 1);
- err = note_last_dentry(dfi, rde->name, rde->name_len,
- next_offset);
+ err = note_last_dentry(fsc, dfi, rde->name,
+ rde->name_len, next_offset);
if (err) {
ceph_mdsc_put_request(dfi->last_readdir);
dfi->last_readdir = NULL;
@@ -508,9 +521,9 @@ more:
}
rinfo = &dfi->last_readdir->r_reply_info;
- dout("readdir frag %x num %d pos %llx chunk first %llx\n",
- dfi->frag, rinfo->dir_nr, ctx->pos,
- rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+ doutc(cl, "%p %llx.%llx frag %x num %d pos %llx chunk first %llx\n",
+ inode, ceph_vinop(inode), dfi->frag, rinfo->dir_nr, ctx->pos,
+ rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
i = 0;
/* search start position */
@@ -530,8 +543,9 @@ more:
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
if (rde->offset < ctx->pos) {
- pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n",
- __func__, rde->offset, ctx->pos);
+ pr_warn_client(cl,
+ "%p %llx.%llx rde->offset 0x%llx ctx->pos 0x%llx\n",
+ inode, ceph_vinop(inode), rde->offset, ctx->pos);
return -EIO;
}
@@ -539,9 +553,9 @@ more:
return -EIO;
ctx->pos = rde->offset;
- dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
- i, rinfo->dir_nr, ctx->pos,
- rde->name_len, rde->name, &rde->inode.in);
+ doutc(cl, "%p %llx.%llx (%d/%d) -> %llx '%.*s' %p\n", inode,
+ ceph_vinop(inode), i, rinfo->dir_nr, ctx->pos,
+ rde->name_len, rde->name, &rde->inode.in);
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
@@ -552,7 +566,7 @@ more:
* doesn't have enough memory, etc. So for next readdir
* it will continue.
*/
- dout("filldir stopping us...\n");
+ doutc(cl, "filldir stopping us...\n");
return 0;
}
@@ -583,7 +597,8 @@ more:
kfree(dfi->last_name);
dfi->last_name = NULL;
}
- dout("readdir next frag is %x\n", frag);
+ doutc(cl, "%p %llx.%llx next frag is %x\n", inode,
+ ceph_vinop(inode), frag);
goto more;
}
dfi->file_info.flags |= CEPH_F_ATEND;
@@ -598,20 +613,23 @@ more:
spin_lock(&ci->i_ceph_lock);
if (dfi->dir_ordered_count ==
atomic64_read(&ci->i_ordered_count)) {
- dout(" marking %p complete and ordered\n", inode);
+ doutc(cl, " marking %p %llx.%llx complete and ordered\n",
+ inode, ceph_vinop(inode));
/* use i_size to track number of entries in
* readdir cache */
BUG_ON(dfi->readdir_cache_idx < 0);
i_size_write(inode, dfi->readdir_cache_idx *
sizeof(struct dentry*));
} else {
- dout(" marking %p complete\n", inode);
+ doutc(cl, " marking %llx.%llx complete\n",
+ ceph_vinop(inode));
}
__ceph_dir_set_complete(ci, dfi->dir_release_count,
dfi->dir_ordered_count);
spin_unlock(&ci->i_ceph_lock);
}
- dout("readdir %p file %p done.\n", inode, file);
+ doutc(cl, "%p %llx.%llx file %p done.\n", inode, ceph_vinop(inode),
+ file);
return 0;
}
@@ -657,6 +675,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file->f_mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
loff_t retval;
inode_lock(inode);
@@ -676,7 +695,8 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
if (offset >= 0) {
if (need_reset_readdir(dfi, offset)) {
- dout("dir_llseek dropping %p content\n", file);
+ doutc(cl, "%p %llx.%llx dropping %p content\n",
+ inode, ceph_vinop(inode), file);
reset_readdir(dfi);
} else if (is_hash_order(offset) && offset > file->f_pos) {
/* for hash offset, we don't know if a forward seek
@@ -703,8 +723,9 @@ out:
struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
+ struct ceph_client *cl = ceph_inode_to_client(parent);
/* .snap dir? */
if (ceph_snap(parent) == CEPH_NOSNAP &&
@@ -713,8 +734,9 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct inode *inode = ceph_get_snapdir(parent);
res = d_splice_alias(inode, dentry);
- dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
- dentry, dentry, inode, res);
+ doutc(cl, "ENOENT on snapdir %p '%pd', linking to "
+ "snapdir %p %llx.%llx. Spliced dentry %p\n",
+ dentry, dentry, inode, ceph_vinop(inode), res);
if (res)
dentry = res;
}
@@ -735,12 +757,15 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
+
if (err == -ENOENT) {
/* no trace? */
err = 0;
if (!req->r_reply_info.head->is_dentry) {
- dout("ENOENT and no trace, dentry %p inode %p\n",
- dentry, d_inode(dentry));
+ doutc(cl,
+ "ENOENT and no trace, dentry %p inode %llx.%llx\n",
+ dentry, ceph_vinop(d_inode(dentry)));
if (d_really_is_positive(dentry)) {
d_drop(dentry);
err = -ENOENT;
@@ -771,15 +796,16 @@ static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_request *req;
int op;
int mask;
int err;
- dout("lookup %p dentry %p '%pd'\n",
- dir, dentry, dentry);
+ doutc(cl, "%p %llx.%llx/'%pd' dentry %p\n", dir, ceph_vinop(dir),
+ dentry, dentry);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -802,7 +828,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
+ doutc(cl, " dir %llx.%llx flags are 0x%lx\n",
+ ceph_vinop(dir), ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
@@ -812,7 +839,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
__ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
- dout(" dir %p complete, -ENOENT\n", dir);
+ doutc(cl, " dir %llx.%llx complete, -ENOENT\n",
+ ceph_vinop(dir));
d_add(dentry, NULL);
di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
return NULL;
@@ -850,7 +878,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
}
dentry = ceph_finish_lookup(req, dentry, err);
ceph_mdsc_put_request(req); /* will dput(dentry) */
- dout("lookup result=%p\n", dentry);
+ doutc(cl, "result=%p\n", dentry);
return dentry;
}
@@ -885,6 +913,7 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -901,8 +930,8 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
- dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
- dir, dentry, mode, rdev);
+ doutc(cl, "%p %llx.%llx/'%pd' dentry %p mode 0%ho rdev %d\n",
+ dir, ceph_vinop(dir), dentry, dentry, mode, rdev);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -924,6 +953,7 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
req->r_parent = dir;
ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_args.mknod.mode = cpu_to_le32(mode);
req->r_args.mknod.rdev = cpu_to_le32(rdev);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
@@ -993,6 +1023,7 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *dest)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
umode_t mode = S_IFLNK | 0777;
@@ -1010,7 +1041,8 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
- dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+ doutc(cl, "%p %llx.%llx/'%pd' to '%s'\n", dir, ceph_vinop(dir), dentry,
+ dest);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -1040,6 +1072,7 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
}
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
@@ -1064,6 +1097,7 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -1076,10 +1110,11 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
- dout("mksnap dir %p snap '%pd' dn %p\n", dir,
- dentry, dentry);
+ doutc(cl, "mksnap %llx.%llx/'%pd' dentry %p\n",
+ ceph_vinop(dir), dentry, dentry);
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
+ doutc(cl, "mkdir %llx.%llx/'%pd' dentry %p mode 0%ho\n",
+ ceph_vinop(dir), dentry, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
err = -EROFS;
@@ -1117,6 +1152,8 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
req->r_parent = dir;
ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ if (op == CEPH_MDS_OP_MKDIR)
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
CEPH_CAP_XATTR_EXCL;
@@ -1144,6 +1181,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int err;
@@ -1161,8 +1199,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
- dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
- dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
+ doutc(cl, "%p %llx.%llx/'%pd' to '%pd'\n", dir, ceph_vinop(dir),
+ old_dentry, dentry);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
d_drop(dentry);
@@ -1199,14 +1237,16 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
struct dentry *dentry = req->r_dentry;
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
- pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
- __func__, dentry, dentry);
+ pr_warn_client(cl,
+ "dentry %p:%pd async unlink bit is not set\n",
+ dentry, dentry);
spin_lock(&fsc->async_unlink_conflict_lock);
hash_del_rcu(&di->hnode);
@@ -1226,7 +1266,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
&base, 0);
/* mark error on parent + clear complete */
@@ -1240,8 +1280,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* mark inode itself for an error (since metadata is bogus) */
mapping_set_error(req->r_old_inode->i_mapping, result);
- pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
+ pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
}
out:
@@ -1290,7 +1330,8 @@ static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
*/
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
@@ -1300,11 +1341,12 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
- dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
+ doutc(cl, "rmsnap %llx.%llx/'%pd' dn\n", ceph_vinop(dir),
+ dentry);
op = CEPH_MDS_OP_RMSNAP;
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("unlink/rmdir dir %p dn %p inode %p\n",
- dir, dentry, inode);
+ doutc(cl, "unlink/rmdir %llx.%llx/'%pd' inode %llx.%llx\n",
+ ceph_vinop(dir), dentry, ceph_vinop(inode));
op = d_is_dir(dentry) ?
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
@@ -1327,9 +1369,9 @@ retry:
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
struct ceph_dentry_info *di = ceph_dentry(dentry);
- dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
- dentry->d_name.len, dentry->d_name.name,
- ceph_cap_string(req->r_dir_caps));
+ doutc(cl, "async unlink on %llx.%llx/'%pd' caps=%s",
+ ceph_vinop(dir), dentry,
+ ceph_cap_string(req->r_dir_caps));
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_callback = ceph_async_unlink_cb;
req->r_old_inode = d_inode(dentry);
@@ -1384,6 +1426,7 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int op = CEPH_MDS_OP_RENAME;
int err;
@@ -1413,8 +1456,9 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (err)
return err;
- dout("rename dir %p dentry %p to dir %p dentry %p\n",
- old_dir, old_dentry, new_dir, new_dentry);
+ doutc(cl, "%llx.%llx/'%pd' to %llx.%llx/'%pd'\n",
+ ceph_vinop(old_dir), old_dentry, ceph_vinop(new_dir),
+ new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1459,9 +1503,10 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
{
struct dentry *dn = di->dentry;
- struct ceph_mds_client *mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+ doutc(cl, "%p %p '%pd'\n", di, dn, dn);
di->flags |= CEPH_DENTRY_LEASE_LIST;
if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
@@ -1469,7 +1514,6 @@ void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
return;
}
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
list_move_tail(&di->lease_list, &mdsc->dentry_leases);
spin_unlock(&mdsc->dentry_list_lock);
@@ -1493,10 +1537,10 @@ static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
{
struct dentry *dn = di->dentry;
- struct ceph_mds_client *mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
- di, dn, dn, di->offset);
+ doutc(cl, "%p %p '%pd' (offset 0x%llx)\n", di, dn, dn, di->offset);
if (!list_empty(&di->lease_list)) {
if (di->flags & CEPH_DENTRY_LEASE_LIST) {
@@ -1516,7 +1560,6 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
return;
}
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
__dentry_dir_lease_touch(mdsc, di);
spin_unlock(&mdsc->dentry_list_lock);
@@ -1530,7 +1573,7 @@ static void __dentry_lease_unlist(struct ceph_dentry_info *di)
if (list_empty(&di->lease_list))
return;
- mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+ mdsc = ceph_sb_to_fs_client(di->dentry->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
list_del_init(&di->lease_list);
spin_unlock(&mdsc->dentry_list_lock);
@@ -1757,6 +1800,8 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *session = NULL;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
u32 seq = 0;
int valid = 0;
@@ -1789,7 +1834,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
CEPH_MDS_LEASE_RENEW, seq);
ceph_put_mds_session(session);
}
- dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+ doutc(cl, "dentry %p = %d\n", dentry, valid);
return valid;
}
@@ -1832,6 +1877,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_client *cl = mdsc->fsc->client;
int valid;
int shared_gen;
@@ -1853,8 +1899,9 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
valid = 0;
spin_unlock(&dentry->d_lock);
}
- dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
- dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
+ doutc(cl, "dir %p %llx.%llx v%u dentry %p '%pd' = %d\n", dir,
+ ceph_vinop(dir), (unsigned)atomic_read(&ci->i_shared_gen),
+ dentry, dentry, valid);
return valid;
}
@@ -1863,10 +1910,11 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
*/
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int valid = 0;
struct dentry *parent;
struct inode *dir, *inode;
- struct ceph_mds_client *mdsc;
valid = fscrypt_d_revalidate(dentry, flags);
if (valid <= 0)
@@ -1884,16 +1932,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
inode = d_inode(dentry);
}
- dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry,
- dentry, inode, ceph_dentry(dentry)->offset,
- !!(dentry->d_flags & DCACHE_NOKEY_NAME));
+ doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n",
+ dentry, dentry, inode, ceph_dentry(dentry)->offset,
+ !!(dentry->d_flags & DCACHE_NOKEY_NAME));
- mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
+ mdsc = ceph_sb_to_fs_client(dir->i_sb)->mdsc;
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
- dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
- dentry, inode);
+ doutc(cl, "%p '%pd' inode %p is SNAPPED\n", dentry,
+ dentry, inode);
valid = 1;
} else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
valid = 1;
@@ -1948,14 +1996,14 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
break;
}
ceph_mdsc_put_request(req);
- dout("d_revalidate %p lookup result=%d\n",
- dentry, err);
+ doutc(cl, "%p '%pd', lookup result=%d\n", dentry,
+ dentry, err);
}
} else {
percpu_counter_inc(&mdsc->metric.d_lease_hit);
}
- dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+ doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid");
if (!valid)
ceph_dir_clear_complete(dir);
@@ -1995,9 +2043,9 @@ static int ceph_d_delete(const struct dentry *dentry)
static void ceph_d_release(struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
- dout("d_release %p\n", dentry);
+ doutc(fsc->client, "dentry %p '%pd'\n", dentry, dentry);
atomic64_dec(&fsc->mdsc->metric.total_dentries);
@@ -2018,10 +2066,12 @@ static void ceph_d_release(struct dentry *dentry)
*/
static void ceph_d_prune(struct dentry *dentry)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *dir_ci;
struct ceph_dentry_info *di;
- dout("ceph_d_prune %pd %p\n", dentry, dentry);
+ doutc(cl, "dentry %p '%pd'\n", dentry, dentry);
/* do we have a valid parent? */
if (IS_ROOT(dentry))
@@ -2064,7 +2114,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
int left;
const int bufsize = 1024;
- if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+ if (!ceph_test_mount_opt(ceph_sb_to_fs_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!dfi->dir_info) {
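
Several of the error paths above also build a printable path for the warning text. A hypothetical helper sketching that pattern; ceph_mdsc_build_path, ceph_mdsc_free_path and pr_warn_client are taken from the hunks in this diff, everything else is illustrative:

static void example_report_failure(struct ceph_mds_client *mdsc,
				   struct dentry *dentry, int result)
{
	struct ceph_client *cl = mdsc->fsc->client;
	int pathlen = 0;
	u64 base = 0;
	char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &base, 0);

	/* the callers above print "<<bad>>" when path building failed */
	pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
		       base, IS_ERR(path) ? "<<bad>>" : path, result);
	ceph_mdsc_free_path(path, pathlen);
}
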
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8559990a59a5..726af69d4d62 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -36,6 +36,7 @@ struct ceph_nfs_snapfh {
static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
static const int snap_handle_length =
sizeof(struct ceph_nfs_snapfh) >> 2;
struct ceph_nfs_snapfh *sfh = (void *)rawfh;
@@ -79,13 +80,14 @@ static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
*max_len = snap_handle_length;
ret = FILEID_BTRFS_WITH_PARENT;
out:
- dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret);
+ doutc(cl, "%p %llx.%llx ret=%d\n", inode, ceph_vinop(inode), ret);
return ret;
}
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
static const int handle_length =
sizeof(struct ceph_nfs_fh) >> 2;
static const int connected_handle_length =
@@ -105,15 +107,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
if (parent_inode) {
struct ceph_nfs_confh *cfh = (void *)rawfh;
- dout("encode_fh %llx with parent %llx\n",
- ceph_ino(inode), ceph_ino(parent_inode));
+ doutc(cl, "%p %llx.%llx with parent %p %llx.%llx\n", inode,
+ ceph_vinop(inode), parent_inode, ceph_vinop(parent_inode));
cfh->ino = ceph_ino(inode);
cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
type = FILEID_INO32_GEN_PARENT;
} else {
struct ceph_nfs_fh *fh = (void *)rawfh;
- dout("encode_fh %llx\n", ceph_ino(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
fh->ino = ceph_ino(inode);
*max_len = handle_length;
type = FILEID_INO32_GEN;
@@ -123,7 +125,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
struct inode *inode;
struct ceph_vino vino;
int err;
@@ -205,7 +207,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
struct ceph_nfs_snapfh *sfh,
bool want_parent)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct inode *inode;
struct ceph_vino vino;
@@ -278,11 +281,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
ceph_mdsc_put_request(req);
if (want_parent) {
- dout("snapfh_to_parent %llx.%llx\n err=%d\n",
- vino.ino, vino.snap, err);
+ doutc(cl, "%llx.%llx\n err=%d\n", vino.ino, vino.snap, err);
} else {
- dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d",
- vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err);
+ doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino,
+ vino.snap, sfh->parent_ino, sfh->hash, err);
}
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -297,6 +299,7 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
struct fid *fid,
int fh_len, int fh_type)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_nfs_fh *fh = (void *)fid->raw;
if (fh_type == FILEID_BTRFS_WITH_PARENT) {
@@ -310,14 +313,14 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
if (fh_len < sizeof(*fh) / 4)
return NULL;
- dout("fh_to_dentry %llx\n", fh->ino);
+ doutc(fsc->client, "%llx\n", fh->ino);
return __fh_to_dentry(sb, fh->ino);
}
static struct dentry *__get_parent(struct super_block *sb,
struct dentry *child, u64 ino)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
struct ceph_mds_request *req;
struct inode *inode;
int mask;
@@ -363,6 +366,7 @@ static struct dentry *__get_parent(struct super_block *sb,
static struct dentry *ceph_get_parent(struct dentry *child)
{
struct inode *inode = d_inode(child);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct dentry *dn;
if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -402,8 +406,8 @@ static struct dentry *ceph_get_parent(struct dentry *child)
dn = __get_parent(child->d_sb, child, 0);
}
out:
- dout("get_parent %p ino %llx.%llx err=%ld\n",
- child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
+ doutc(cl, "child %p %p %llx.%llx err=%ld\n", child, inode,
+ ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
return dn;
}
@@ -414,6 +418,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
struct fid *fid,
int fh_len, int fh_type)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_nfs_confh *cfh = (void *)fid->raw;
struct dentry *dentry;
@@ -427,7 +432,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
if (fh_len < sizeof(*cfh) / 4)
return NULL;
- dout("fh_to_parent %llx\n", cfh->parent_ino);
+ doutc(fsc->client, "%llx\n", cfh->parent_ino);
dentry = __get_parent(sb, NULL, cfh->ino);
if (unlikely(dentry == ERR_PTR(-ENOENT)))
dentry = __fh_to_dentry(sb, cfh->parent_ino);
@@ -439,7 +444,7 @@ static int __get_snap_name(struct dentry *parent, char *name,
{
struct inode *inode = d_inode(child);
struct inode *dir = d_inode(parent);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_request *req = NULL;
char *last_name = NULL;
unsigned next_offset = 2;
@@ -526,8 +531,8 @@ out:
if (req)
ceph_mdsc_put_request(req);
kfree(last_name);
- dout("get_snap_name %p ino %llx.%llx err=%d\n",
- child, ceph_vinop(inode), err);
+ doutc(fsc->client, "child dentry %p %p %llx.%llx err=%d\n", child,
+ inode, ceph_vinop(inode), err);
return err;
}
@@ -544,7 +549,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
if (ceph_snap(inode) != CEPH_NOSNAP)
return __get_snap_name(parent, name, child);
- mdsc = ceph_inode_to_client(inode)->mdsc;
+ mdsc = ceph_inode_to_fs_client(inode)->mdsc;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
USE_ANY_MDS);
if (IS_ERR(req))
@@ -588,9 +593,9 @@ static int ceph_get_name(struct dentry *parent, char *name,
ceph_fname_free_buffer(dir, &oname);
}
out:
- dout("get_name %p ino %llx.%llx err %d %s%s\n",
- child, ceph_vinop(inode), err,
- err ? "" : "name ", err ? "" : name);
+ doutc(mdsc->fsc->client, "child dentry %p %p %llx.%llx err %d %s%s\n",
+ child, inode, ceph_vinop(inode), err, err ? "" : "name ",
+ err ? "" : name);
ceph_mdsc_put_request(req);
return err;
}
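Every dout()/pr_warn()/pr_err() conversion in export.c above, and in file.c below, follows one mechanical recipe: fetch a struct ceph_client (from the inode, superblock, fs_client or mdsc, whichever is already in scope) and pass it as the first argument so the message carries that client's identity instead of being anonymous. A minimal user-space analogue of the macro shape (dout_demo/doutc_demo and the "client%llu" prefix format are illustrative; the real macros live in the ceph debug headers):

#include <stdio.h>

struct client_demo {
	unsigned long long global_id;
};

/* old shape: nothing identifies which mounted client logged the line */
#define dout_demo(fmt, ...) \
	fprintf(stderr, "ceph: " fmt, ##__VA_ARGS__)

/* new shape: the client handle is explicit, so its id prefixes the line
 * (##__VA_ARGS__ is the gcc/clang extension the kernel itself relies on) */
#define doutc_demo(cl, fmt, ...) \
	fprintf(stderr, "ceph: [client%llu] %s: " fmt, \
		(cl)->global_id, __func__, ##__VA_ARGS__)

static void open_file(struct client_demo *cl, const char *name, int flags)
{
	dout_demo("file '%s' flags %d\n", name, flags);	     /* anonymous  */
	doutc_demo(cl, "file '%s' flags %d\n", name, flags); /* attributed */
}

int main(void)
{
	struct client_demo a = { .global_id = 4215 };
	struct client_demo b = { .global_id = 4301 };

	open_file(&a, "foo", 0);	/* ceph: [client4215] open_file: ... */
	open_file(&b, "bar", 0);	/* ceph: [client4301] open_file: ... */
	return 0;
}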
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b5f8038065d7..3b5aae29e944 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -19,8 +19,9 @@
#include "io.h"
#include "metric.h"
-static __le32 ceph_flags_sys2wire(u32 flags)
+static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u32 wire_flags = 0;
switch (flags & O_ACCMODE) {
@@ -48,7 +49,7 @@ static __le32 ceph_flags_sys2wire(u32 flags)
#undef ceph_sys2wire
if (flags)
- dout("unused open flags: %x\n", flags);
+ doutc(cl, "unused open flags: %x\n", flags);
return cpu_to_le32(wire_flags);
}
@@ -189,7 +190,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
if (IS_ERR(req))
goto out;
req->r_fmode = ceph_flags_to_mode(flags);
- req->r_args.open.flags = ceph_flags_sys2wire(flags);
+ req->r_args.open.flags = ceph_flags_sys2wire(mdsc, flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
out:
return req;
@@ -200,12 +201,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->netfs.inode)->mount_options;
+ ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_file_info *fi;
int ret;
- dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
- inode->i_mode, isdir ? "dir" : "regular");
+ doutc(cl, "%p %llx.%llx %p 0%o (%s)\n", inode, ceph_vinop(inode),
+ file, inode->i_mode, isdir ? "dir" : "regular");
BUG_ON(inode->i_fop->release != ceph_release);
if (isdir) {
@@ -234,7 +236,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
- fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
+ fi->filp_gen = READ_ONCE(ceph_inode_to_fs_client(inode)->filp_gen);
if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
ret = ceph_uninline_data(file);
@@ -259,6 +261,7 @@ error:
*/
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret = 0;
switch (inode->i_mode & S_IFMT) {
@@ -271,13 +274,13 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
break;
case S_IFLNK:
- dout("init_file %p %p 0%o (symlink)\n", inode, file,
- inode->i_mode);
+ doutc(cl, "%p %llx.%llx %p 0%o (symlink)\n", inode,
+ ceph_vinop(inode), file, inode->i_mode);
break;
default:
- dout("init_file %p %p 0%o (special)\n", inode, file,
- inode->i_mode);
+ doutc(cl, "%p %llx.%llx %p 0%o (special)\n", inode,
+ ceph_vinop(inode), file, inode->i_mode);
/*
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
@@ -296,6 +299,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
int ceph_renew_caps(struct inode *inode, int fmode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
@@ -307,8 +311,9 @@ int ceph_renew_caps(struct inode *inode, int fmode)
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
int issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock);
- dout("renew caps %p want %s issued %s updating mds_wanted\n",
- inode, ceph_cap_string(wanted), ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx want %s issued %s updating mds_wanted\n",
+ inode, ceph_vinop(inode), ceph_cap_string(wanted),
+ ceph_cap_string(issued));
ceph_check_caps(ci, 0);
return 0;
}
@@ -339,7 +344,8 @@ int ceph_renew_caps(struct inode *inode, int fmode)
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
out:
- dout("renew caps %p open result=%d\n", inode, err);
+ doutc(cl, "%p %llx.%llx open result=%d\n", inode, ceph_vinop(inode),
+ err);
return err < 0 ? err : 0;
}
@@ -352,7 +358,8 @@ out:
int ceph_open(struct inode *inode, struct file *file)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *fi = file->private_data;
@@ -360,7 +367,7 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
if (fi) {
- dout("open file %p is already opened\n", file);
+ doutc(cl, "file %p is already opened\n", file);
return 0;
}
@@ -374,8 +381,8 @@ int ceph_open(struct inode *inode, struct file *file)
return err;
}
- dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
- ceph_vinop(inode), file, flags, file->f_flags);
+ doutc(cl, "%p %llx.%llx file %p flags %d (%d)\n", inode,
+ ceph_vinop(inode), file, flags, file->f_flags);
fmode = ceph_flags_to_mode(flags);
wanted = ceph_caps_for_mode(fmode);
@@ -399,9 +406,9 @@ int ceph_open(struct inode *inode, struct file *file)
int mds_wanted = __ceph_caps_mds_wanted(ci, true);
int issued = __ceph_caps_issued(ci, NULL);
- dout("open %p fmode %d want %s issued %s using existing\n",
- inode, fmode, ceph_cap_string(wanted),
- ceph_cap_string(issued));
+ doutc(cl, "open %p fmode %d want %s issued %s using existing\n",
+ inode, fmode, ceph_cap_string(wanted),
+ ceph_cap_string(issued));
__ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
@@ -421,7 +428,7 @@ int ceph_open(struct inode *inode, struct file *file)
spin_unlock(&ci->i_ceph_lock);
- dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+ doutc(cl, "open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
req = prepare_open_request(inode->i_sb, flags, 0);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -435,7 +442,7 @@ int ceph_open(struct inode *inode, struct file *file)
if (!err)
err = ceph_init_file(inode, file, req->r_fmode);
ceph_mdsc_put_request(req);
- dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+ doutc(cl, "open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
return err;
}
@@ -515,6 +522,7 @@ no_async:
static void restore_deleg_ino(struct inode *dir, u64 ino)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_mds_session *s = NULL;
@@ -525,7 +533,8 @@ static void restore_deleg_ino(struct inode *dir, u64 ino)
if (s) {
int err = ceph_restore_deleg_ino(s, ino);
if (err)
- pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
+ pr_warn_client(cl,
+ "unable to restore delegated ino 0x%llx to session: %d\n",
ino, err);
ceph_put_mds_session(s);
}
@@ -557,6 +566,7 @@ static void wake_async_create_waiters(struct inode *inode,
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct dentry *dentry = req->r_dentry;
struct inode *dinode = d_inode(dentry);
struct inode *tinode = req->r_target_inode;
@@ -574,10 +584,11 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
&base, 0);
- pr_warn("async create failure path=(%llx)%s result=%d!\n",
+ pr_warn_client(cl,
+ "async create failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
@@ -596,14 +607,15 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
u64 ino = ceph_vino(tinode).ino;
if (req->r_deleg_ino != ino)
- pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
- __func__, req->r_err, req->r_deleg_ino, ino);
+ pr_warn_client(cl,
+ "inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+ req->r_err, req->r_deleg_ino, ino);
mapping_set_error(tinode->i_mapping, result);
wake_async_create_waiters(tinode, req->r_session);
} else if (!result) {
- pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
- req->r_deleg_ino);
+ pr_warn_client(cl, "no req->r_target_inode for 0x%llx\n",
+ req->r_deleg_ino);
}
out:
ceph_mdsc_release_dir_caps(req);
@@ -625,6 +637,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
struct timespec64 now;
struct ceph_string *pool_ns;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_vino vino = { .ino = req->r_deleg_ino,
.snap = CEPH_NOSNAP };
@@ -655,7 +668,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
in.truncate_seq = cpu_to_le32(1);
in.truncate_size = cpu_to_le64(-1ULL);
in.xattr_version = cpu_to_le64(1);
- in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+ in.uid = cpu_to_le32(from_kuid(&init_user_ns,
+ mapped_fsuid(req->r_mnt_idmap,
+ &init_user_ns)));
if (dir->i_mode & S_ISGID) {
in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
@@ -663,7 +678,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
if (S_ISDIR(mode))
mode |= S_ISGID;
} else {
- in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns,
+ mapped_fsgid(req->r_mnt_idmap,
+ &init_user_ns)));
}
in.mode = cpu_to_le32((u32)mode);
@@ -683,7 +700,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
req->r_fmode, NULL);
up_read(&mdsc->snap_rwsem);
if (ret) {
- dout("%s failed to fill inode: %d\n", __func__, ret);
+ doutc(cl, "failed to fill inode: %d\n", ret);
ceph_dir_clear_complete(dir);
if (!d_unhashed(dentry))
d_drop(dentry);
@@ -691,8 +708,8 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
} else {
struct dentry *dn;
- dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
- vino.ino, ceph_ino(dir), dentry->d_name.name);
+ doutc(cl, "d_adding new inode 0x%llx to 0x%llx/%s\n",
+ vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
if (inode->i_state & I_NEW) {
@@ -730,7 +747,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct inode *new_inode = NULL;
@@ -740,9 +759,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
- dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
- dir, dentry, dentry,
- d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
+ doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
+ dir, ceph_vinop(dir), dentry, dentry,
+ d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
@@ -788,6 +807,8 @@ retry:
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask);
req->r_parent = dir;
+ if (req->r_op == CEPH_MDS_OP_CREATE)
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
ihold(dir);
if (IS_ENCRYPTED(dir)) {
set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
@@ -880,17 +901,18 @@ retry:
goto out_req;
if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
/* make vfs retry on splice, ENOENT, or symlink */
- dout("atomic_open finish_no_open on dn %p\n", dn);
+ doutc(cl, "finish_no_open on dn %p\n", dn);
err = finish_no_open(file, dn);
} else {
if (IS_ENCRYPTED(dir) &&
!fscrypt_has_permitted_context(dir, d_inode(dentry))) {
- pr_warn("Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
+ pr_warn_client(cl,
+ "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
goto out_req;
}
- dout("atomic_open finish_open on dn %p\n", dn);
+ doutc(cl, "finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
struct inode *newino = d_inode(dentry);
@@ -905,17 +927,19 @@ out_req:
iput(new_inode);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
- dout("atomic_open result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
int ceph_release(struct inode *inode, struct file *file)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (S_ISDIR(inode->i_mode)) {
struct ceph_dir_file_info *dfi = file->private_data;
- dout("release inode %p dir file %p\n", inode, file);
+ doutc(cl, "%p %llx.%llx dir file %p\n", inode,
+ ceph_vinop(inode), file);
WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
ceph_put_fmode(ci, dfi->file_info.fmode, 1);
@@ -927,7 +951,8 @@ int ceph_release(struct inode *inode, struct file *file)
kmem_cache_free(ceph_dir_file_cachep, dfi);
} else {
struct ceph_file_info *fi = file->private_data;
- dout("release inode %p regular file %p\n", inode, file);
+ doutc(cl, "%p %llx.%llx regular file %p\n", inode,
+ ceph_vinop(inode), file);
WARN_ON(!list_empty(&fi->rw_contexts));
ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
@@ -962,7 +987,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
u64 *last_objver)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
ssize_t ret;
u64 off = *ki_pos;
@@ -971,7 +997,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 objver = 0;
- dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len);
+ doutc(cl, "on inode %p %llx.%llx %llx~%llx\n", inode,
+ ceph_vinop(inode), *ki_pos, len);
if (ceph_inode_is_shutdown(inode))
return -EIO;
@@ -1005,8 +1032,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
/* determine new offset/length if encrypted */
ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
- dout("sync_read orig %llu~%llu reading %llu~%llu",
- off, len, read_off, read_len);
+ doutc(cl, "orig %llu~%llu reading %llu~%llu", off, len,
+ read_off, read_len);
req = ceph_osdc_new_request(osdc, &ci->i_layout,
ci->i_vino, read_off, &read_len, 0, 1,
@@ -1059,8 +1086,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
objver = req->r_version;
i_size = i_size_read(inode);
- dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
- off, len, ret, i_size, (more ? " MORE" : ""));
+ doutc(cl, "%llu~%llu got %zd i_size %llu%s\n", off, len,
+ ret, i_size, (more ? " MORE" : ""));
/* Fix it to go to end of extent map */
if (sparse && ret >= 0)
@@ -1101,8 +1128,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
int zlen = min(len - ret, i_size - off - ret);
int zoff = page_off + ret;
- dout("sync_read zero gap %llu~%llu\n",
- off + ret, off + ret + zlen);
+ doutc(cl, "zero gap %llu~%llu\n", off + ret,
+ off + ret + zlen);
ceph_zero_page_vector_range(zoff, zlen, pages);
ret += zlen;
}
@@ -1151,7 +1178,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
if (last_objver)
*last_objver = objver;
}
- dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
+ doutc(cl, "result %zd retry_op %d\n", ret, *retry_op);
return ret;
}
@@ -1160,9 +1187,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
- dout("sync_read on file %p %llx~%zx %s\n", file, iocb->ki_pos,
- iov_iter_count(to), (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+ doutc(cl, "on file %p %llx~%zx %s\n", file, iocb->ki_pos,
+ iov_iter_count(to),
+ (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL);
}
@@ -1190,6 +1219,7 @@ static void ceph_aio_retry_work(struct work_struct *work);
static void ceph_aio_complete(struct inode *inode,
struct ceph_aio_request *aio_req)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int ret;
@@ -1203,7 +1233,7 @@ static void ceph_aio_complete(struct inode *inode,
if (!ret)
ret = aio_req->total_len;
- dout("ceph_aio_complete %p rc %d\n", inode, ret);
+ doutc(cl, "%p %llx.%llx rc %d\n", inode, ceph_vinop(inode), ret);
if (ret >= 0 && aio_req->write) {
int dirty;
@@ -1242,11 +1272,13 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
unsigned int len = osd_data->bvec_pos.iter.bi_size;
bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);
- dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len);
+ doutc(cl, "req %p inode %p %llx.%llx, rc %d bytes %u\n", req,
+ inode, ceph_vinop(inode), rc, len);
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
@@ -1256,7 +1288,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
if (aio_work) {
INIT_WORK(&aio_work->work, ceph_aio_retry_work);
aio_work->req = req;
- queue_work(ceph_inode_to_client(inode)->inode_wq,
+ queue_work(ceph_inode_to_fs_client(inode)->inode_wq,
&aio_work->work);
return;
}
@@ -1386,7 +1418,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_client_metric *metric = &fsc->mdsc->metric;
struct ceph_vino vino;
struct ceph_osd_request *req;
@@ -1405,9 +1438,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
- (write ? "write" : "read"), file, pos, (unsigned)count,
- snapc, snapc ? snapc->seq : 0);
+ doutc(cl, "sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
+ (write ? "write" : "read"), file, pos, (unsigned)count,
+ snapc, snapc ? snapc->seq : 0);
if (write) {
int ret2;
@@ -1418,7 +1451,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
- dout("invalidate_inode_pages2_range returned %d\n", ret2);
+ doutc(cl, "invalidate_inode_pages2_range returned %d\n",
+ ret2);
flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
} else {
@@ -1610,7 +1644,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
struct page **pages;
@@ -1625,8 +1660,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
- file, pos, (unsigned)count, snapc, snapc->seq);
+ doutc(cl, "on file %p %lld~%u snapc %p seq %lld\n", file, pos,
+ (unsigned)count, snapc, snapc->seq);
ret = filemap_write_and_wait_range(inode->i_mapping,
pos, pos + count - 1);
@@ -1670,9 +1705,9 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
last = (pos + len) != (write_pos + write_len);
rmw = first || last;
- dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
- ci->i_vino.ino, pos, len, write_pos, write_len,
- rmw ? "" : "no ");
+ doutc(cl, "ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
+ ci->i_vino.ino, pos, len, write_pos, write_len,
+ rmw ? "" : "no ");
/*
* The data is emplaced into the page as it would be if it were
@@ -1881,7 +1916,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
left -= ret;
}
if (ret < 0) {
- dout("sync_write write failed with %d\n", ret);
+ doutc(cl, "write failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
break;
}
@@ -1891,7 +1926,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
write_pos, write_len,
GFP_KERNEL);
if (ret < 0) {
- dout("encryption failed with %d\n", ret);
+ doutc(cl, "encryption failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
break;
}
@@ -1910,7 +1945,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
break;
}
- dout("sync_write write op %lld~%llu\n", write_pos, write_len);
+ doutc(cl, "write op %lld~%llu\n", write_pos, write_len);
osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len,
offset_in_page(write_pos), false,
true);
@@ -1941,7 +1976,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
req->r_end_latency, len, ret);
ceph_osdc_put_request(req);
if (ret != 0) {
- dout("sync_write osd write returned %d\n", ret);
+ doutc(cl, "osd write returned %d\n", ret);
/* Version changed! Must re-do the rmw cycle */
if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) ||
(!assert_ver && ret == -EEXIST)) {
@@ -1971,13 +2006,13 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
pos >> PAGE_SHIFT,
(pos + len - 1) >> PAGE_SHIFT);
if (ret < 0) {
- dout("invalidate_inode_pages2_range returned %d\n",
- ret);
+ doutc(cl, "invalidate_inode_pages2_range returned %d\n",
+ ret);
ret = 0;
}
pos += len;
written += len;
- dout("sync_write written %d\n", written);
+ doutc(cl, "written %d\n", written);
if (pos > i_size_read(inode)) {
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
@@ -1991,7 +2026,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
ret = written;
iocb->ki_pos = pos;
}
- dout("sync_write returning %d\n", ret);
+ doutc(cl, "returning %d\n", ret);
return ret;
}
@@ -2010,13 +2045,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
ssize_t ret;
int want = 0, got = 0;
int retry_op = 0, read = 0;
again:
- dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+ doutc(cl, "%llu~%u trying to get caps on %p %llx.%llx\n",
+ iocb->ki_pos, (unsigned)len, inode, ceph_vinop(inode));
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
@@ -2044,9 +2080,9 @@ again:
(iocb->ki_flags & IOCB_DIRECT) ||
(fi->flags & CEPH_F_SYNC)) {
- dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
- ceph_cap_string(got));
+ doutc(cl, "sync %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
if (!ceph_has_inline_data(ci)) {
if (!retry_op &&
@@ -2064,16 +2100,16 @@ again:
}
} else {
CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
- dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
- ceph_cap_string(got));
+ doutc(cl, "async %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
ceph_add_rw_context(fi, &rw_ctx);
ret = generic_file_read_iter(iocb, to);
ceph_del_rw_context(fi, &rw_ctx);
}
- dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
- inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+ doutc(cl, "%p %llx.%llx dropping cap refs on %s = %d\n",
+ inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
ceph_put_cap_refs(ci, got);
if (direct_lock)
@@ -2133,8 +2169,8 @@ again:
/* hit EOF or hole? */
if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
ret < len) {
- dout("sync_read hit hole, ppos %lld < size %lld"
- ", reading more\n", iocb->ki_pos, i_size);
+ doutc(cl, "hit hole, ppos %lld < size %lld, reading more\n",
+ iocb->ki_pos, i_size);
read += ret;
len -= ret;
@@ -2228,7 +2264,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
@@ -2296,8 +2333,9 @@ retry_snap:
if (err)
goto out;
- dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, count, i_size_read(inode));
+ doutc(cl, "%p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, count,
+ i_size_read(inode));
if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
want |= CEPH_CAP_FILE_BUFFER;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -2313,8 +2351,8 @@ retry_snap:
inode_inc_iversion_raw(inode);
- dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx %llu~%zd got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
@@ -2374,14 +2412,14 @@ retry_snap:
ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
- dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)count,
- ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count,
+ ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
if (written == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)count);
+ doutc(cl, "%p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count);
goto retry_snap;
}
@@ -2462,7 +2500,7 @@ static int ceph_zero_partial_object(struct inode *inode,
loff_t offset, loff_t *length)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_osd_request *req;
int ret = 0;
loff_t zero = 0;
@@ -2489,7 +2527,7 @@ static int ceph_zero_partial_object(struct inode *inode,
goto out;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (ret == -ENOENT)
@@ -2553,14 +2591,15 @@ static long ceph_fallocate(struct file *file, int mode,
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap_flush *prealloc_cf;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int want, got = 0;
int dirty;
int ret = 0;
loff_t endoff = 0;
loff_t size;
- dout("%s %p %llx.%llx mode %x, offset %llu length %llu\n", __func__,
- inode, ceph_vinop(inode), mode, offset, length);
+ doutc(cl, "%p %llx.%llx mode %x, offset %llu length %llu\n",
+ inode, ceph_vinop(inode), mode, offset, length);
if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -2689,6 +2728,7 @@ static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
loff_t src_off, loff_t dst_off, size_t len)
{
+ struct ceph_client *cl = ceph_inode_to_client(src_inode);
loff_t size, endoff;
size = i_size_read(src_inode);
@@ -2699,8 +2739,8 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
* inode.
*/
if (src_off + len > size) {
- dout("Copy beyond EOF (%llu + %zu > %llu)\n",
- src_off, len, size);
+ doutc(cl, "Copy beyond EOF (%llu + %zu > %llu)\n", src_off,
+ len, size);
return -EOPNOTSUPP;
}
size = i_size_read(dst_inode);
@@ -2776,6 +2816,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
u32 src_objlen, dst_objlen;
u32 object_size = src_ci->i_layout.object_size;
+ struct ceph_client *cl = fsc->client;
int ret;
src_oloc.pool = src_ci->i_layout.pool_id;
@@ -2817,9 +2858,10 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
if (ret) {
if (ret == -EOPNOTSUPP) {
fsc->have_copy_from2 = false;
- pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ pr_notice_client(cl,
+ "OSDs don't support copy-from2; disabling copy offload\n");
}
- dout("ceph_osdc_copy_from returned %d\n", ret);
+ doutc(cl, "returned %d\n", ret);
if (!bytes)
bytes = ret;
goto out;
@@ -2845,7 +2887,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *src_ci = ceph_inode(src_inode);
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf;
- struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
+ struct ceph_fs_client *src_fsc = ceph_inode_to_fs_client(src_inode);
+ struct ceph_client *cl = src_fsc->client;
loff_t size;
ssize_t ret = -EIO, bytes;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
@@ -2853,7 +2896,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
int src_got = 0, dst_got = 0, err, dirty;
if (src_inode->i_sb != dst_inode->i_sb) {
- struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
+ struct ceph_fs_client *dst_fsc = ceph_inode_to_fs_client(dst_inode);
if (ceph_fsid_compare(&src_fsc->client->fsid,
&dst_fsc->client->fsid)) {
@@ -2888,7 +2931,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
(src_ci->i_layout.stripe_count != 1) ||
(dst_ci->i_layout.stripe_count != 1) ||
(src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
- dout("Invalid src/dst files layout\n");
+ doutc(cl, "Invalid src/dst files layout\n");
return -EOPNOTSUPP;
}
@@ -2906,12 +2949,12 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Start by sync'ing the source and destination files */
ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
if (ret < 0) {
- dout("failed to write src file (%zd)\n", ret);
+ doutc(cl, "failed to write src file (%zd)\n", ret);
goto out;
}
ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
if (ret < 0) {
- dout("failed to write dst file (%zd)\n", ret);
+ doutc(cl, "failed to write dst file (%zd)\n", ret);
goto out;
}
@@ -2923,7 +2966,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
err = get_rd_wr_caps(src_file, &src_got,
dst_file, (dst_off + len), &dst_got);
if (err < 0) {
- dout("get_rd_wr_caps returned %d\n", err);
+ doutc(cl, "get_rd_wr_caps returned %d\n", err);
ret = -EOPNOTSUPP;
goto out;
}
@@ -2938,7 +2981,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
dst_off >> PAGE_SHIFT,
(dst_off + len) >> PAGE_SHIFT);
if (ret < 0) {
- dout("Failed to invalidate inode pages (%zd)\n", ret);
+ doutc(cl, "Failed to invalidate inode pages (%zd)\n",
+ ret);
ret = 0; /* XXX */
}
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
@@ -2959,7 +3003,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off
*/
if (src_objoff) {
- dout("Initial partial copy of %u bytes\n", src_objlen);
+ doutc(cl, "Initial partial copy of %u bytes\n", src_objlen);
/*
* we need to temporarily drop all caps as we'll be calling
@@ -2970,7 +3014,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
&dst_off, src_objlen, flags);
/* Abort on short copies or on error */
if (ret < (long)src_objlen) {
- dout("Failed partial copy (%zd)\n", ret);
+ doutc(cl, "Failed partial copy (%zd)\n", ret);
goto out;
}
len -= ret;
@@ -2992,7 +3036,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
ret = bytes;
goto out_caps;
}
- dout("Copied %zu bytes out of %zu\n", bytes, len);
+ doutc(cl, "Copied %zu bytes out of %zu\n", bytes, len);
len -= bytes;
ret += bytes;
@@ -3020,13 +3064,13 @@ out_caps:
* there were errors in remote object copies (len >= object_size).
*/
if (len && (len < src_ci->i_layout.object_size)) {
- dout("Final partial copy of %zu bytes\n", len);
+ doutc(cl, "Final partial copy of %zu bytes\n", len);
bytes = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, len, flags);
if (bytes > 0)
ret += bytes;
else
- dout("Failed partial copy (%zd)\n", bytes);
+ doutc(cl, "Failed partial copy (%zd)\n", bytes);
}
out:
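Besides the logging conversion, the file.c hunks above teach the async-create path about idmapped mounts: ceph_atomic_open() now pins the file's mnt_idmap into req->r_mnt_idmap for CEPH_MDS_OP_CREATE, and ceph_finish_async_create() records mapped_fsuid()/mapped_fsgid() instead of the raw current_fsuid()/current_fsgid(), so the owner written for the locally created inode reflects the mount's ID mapping. Conceptually that mapping is a range translation; a toy version of the idea (the extent layout, helper names and mapping direction here are simplified assumptions, not the kernel's mnt_idmap API):

#include <stdio.h>

struct idmap_extent {
	unsigned int first_caller;	/* first id as the caller sees it */
	unsigned int first_fs;		/* matching id recorded on the fs */
	unsigned int count;
};

/* Translate a caller-side id through the mount's mapping table;
 * ids outside every extent are passed through unchanged here. */
static unsigned int map_id(const struct idmap_extent *map, int n,
			   unsigned int id)
{
	for (int i = 0; i < n; i++) {
		if (id >= map[i].first_caller &&
		    id - map[i].first_caller < map[i].count)
			return map[i].first_fs + (id - map[i].first_caller);
	}
	return id;
}

int main(void)
{
	/* e.g. caller uids 1000..1999 are stored as 100000..100999 */
	const struct idmap_extent map[] = { { 1000, 100000, 1000 } };

	printf("caller uid 1000  -> fs uid %u\n", map_id(map, 1, 1000));
	printf("caller uid 65534 -> fs uid %u\n", map_id(map, 1, 65534));
	return 0;
}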
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b79100f720b3..0679240f06db 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -129,6 +129,8 @@ void ceph_as_ctx_to_req(struct ceph_mds_request *req,
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
struct inode *newino)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
if (ceph_vino_is_reserved(vino))
@@ -145,12 +147,13 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
}
if (!inode) {
- dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
+ doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap);
return ERR_PTR(-ENOMEM);
}
- dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
- ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
+ doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
+ ceph_present_inode(inode), ceph_vinop(inode), inode,
+ !!(inode->i_state & I_NEW));
return inode;
}
@@ -159,6 +162,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
*/
struct inode *ceph_get_snapdir(struct inode *parent)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct ceph_vino vino = {
.ino = ceph_ino(parent),
.snap = CEPH_SNAPDIR,
@@ -171,23 +175,23 @@ struct inode *ceph_get_snapdir(struct inode *parent)
return inode;
if (!S_ISDIR(parent->i_mode)) {
- pr_warn_once("bad snapdir parent type (mode=0%o)\n",
- parent->i_mode);
+ pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n",
+ parent->i_mode);
goto err;
}
if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
- pr_warn_once("bad snapdir inode type (mode=0%o)\n",
- inode->i_mode);
+ pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
+ inode->i_mode);
goto err;
}
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_mtime = parent->i_mtime;
+ inode_set_mtime_to_ts(inode, inode_get_mtime(parent));
inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
- inode->i_atime = parent->i_atime;
+ inode_set_atime_to_ts(inode, inode_get_atime(parent));
ci->i_rbytes = 0;
ci->i_btime = ceph_inode(parent)->i_btime;
@@ -203,7 +207,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_flags |= S_ENCRYPTED;
ci->fscrypt_auth_len = pci->fscrypt_auth_len;
} else {
- dout("Failed to alloc snapdir fscrypt_auth\n");
+ doutc(cl, "Failed to alloc snapdir fscrypt_auth\n");
ret = -ENOMEM;
goto err;
}
@@ -249,6 +253,8 @@ const struct inode_operations ceph_file_iops = {
static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
u32 f)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_frag *frag;
@@ -279,8 +285,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_link_node(&frag->node, parent, p);
rb_insert_color(&frag->node, &ci->i_fragtree);
- dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->netfs.inode), f);
+ doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f);
return frag;
}
@@ -313,6 +318,7 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag, int *found)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
u32 t = ceph_frag_make(0, 0);
struct ceph_inode_frag *frag;
unsigned nway, i;
@@ -336,8 +342,8 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
/* choose child */
nway = 1 << frag->split_by;
- dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
- frag->split_by, nway);
+ doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t,
+ frag->split_by, nway);
for (i = 0; i < nway; i++) {
n = ceph_frag_make_child(t, frag->split_by, i);
if (ceph_frag_contains_value(n, v)) {
@@ -347,7 +353,7 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
}
BUG_ON(i == nway);
}
- dout("choose_frag(%x) = %x\n", v, t);
+ doutc(cl, "frag(%x) = %x\n", v, t);
return t;
}
@@ -371,6 +377,7 @@ static int ceph_fill_dirfrag(struct inode *inode,
struct ceph_mds_reply_dirfrag *dirinfo)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_frag *frag;
u32 id = le32_to_cpu(dirinfo->frag);
int mds = le32_to_cpu(dirinfo->auth);
@@ -395,14 +402,14 @@ static int ceph_fill_dirfrag(struct inode *inode,
goto out;
if (frag->split_by == 0) {
/* tree leaf, remove */
- dout("fill_dirfrag removed %llx.%llx frag %x"
- " (no ref)\n", ceph_vinop(inode), id);
+ doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n",
+ inode, ceph_vinop(inode), id);
rb_erase(&frag->node, &ci->i_fragtree);
kfree(frag);
} else {
/* tree branch, keep and clear */
- dout("fill_dirfrag cleared %llx.%llx frag %x"
- " referral\n", ceph_vinop(inode), id);
+ doutc(cl, "cleared %p %llx.%llx frag %x referral\n",
+ inode, ceph_vinop(inode), id);
frag->mds = -1;
frag->ndist = 0;
}
@@ -415,8 +422,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
if (IS_ERR(frag)) {
/* this is not the end of the world; we can continue
with bad/inaccurate delegation info */
- pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
- ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+ pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n",
+ inode, ceph_vinop(inode),
+ le32_to_cpu(dirinfo->frag));
err = -ENOMEM;
goto out;
}
@@ -425,8 +433,8 @@ static int ceph_fill_dirfrag(struct inode *inode,
frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
for (i = 0; i < frag->ndist; i++)
frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
- dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
- ceph_vinop(inode), frag->frag, frag->ndist);
+ doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode,
+ ceph_vinop(inode), frag->frag, frag->ndist);
out:
mutex_unlock(&ci->i_fragtree_mutex);
@@ -454,6 +462,7 @@ static int ceph_fill_fragtree(struct inode *inode,
struct ceph_frag_tree_head *fragtree,
struct ceph_mds_reply_dirfrag *dirinfo)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_frag *frag, *prev_frag = NULL;
struct rb_node *rb_node;
@@ -489,15 +498,15 @@ static int ceph_fill_fragtree(struct inode *inode,
frag_tree_split_cmp, NULL);
}
- dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
rb_node = rb_first(&ci->i_fragtree);
for (i = 0; i < nsplits; i++) {
id = le32_to_cpu(fragtree->splits[i].frag);
split_by = le32_to_cpu(fragtree->splits[i].by);
if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
- pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
- "frag %x split by %d\n", ceph_vinop(inode),
- i, nsplits, id, split_by);
+ pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, "
+ "frag %x split by %d\n", inode,
+ ceph_vinop(inode), i, nsplits, id, split_by);
continue;
}
frag = NULL;
@@ -529,7 +538,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (frag->split_by == 0)
ci->i_fragtree_nsplits++;
frag->split_by = split_by;
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by);
prev_frag = frag;
}
while (rb_node) {
@@ -554,6 +563,7 @@ out_unlock:
*/
struct inode *ceph_alloc_inode(struct super_block *sb)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_inode_info *ci;
int i;
@@ -561,7 +571,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->netfs.inode);
+ doutc(fsc->client, "%p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
@@ -675,10 +685,11 @@ void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_frag *frag;
struct rb_node *n;
- dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode));
percpu_counter_dec(&mdsc->metric.total_inodes);
@@ -701,8 +712,8 @@ void ceph_evict_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
if (ceph_snap(inode) == CEPH_NOSNAP) {
- dout(" dropping residual ref to snap realm %p\n",
- ci->i_snap_realm);
+ doutc(cl, " dropping residual ref to snap realm %p\n",
+ ci->i_snap_realm);
ceph_change_snap_realm(inode, NULL);
} else {
ceph_put_snapid_map(mdsc, ci->i_snapid_map);
@@ -743,15 +754,16 @@ static inline blkcnt_t calc_inode_blocks(u64 size)
int ceph_fill_file_size(struct inode *inode, int issued,
u32 truncate_seq, u64 truncate_size, u64 size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int queue_trunc = 0;
loff_t isize = i_size_read(inode);
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
(truncate_seq == ci->i_truncate_seq && size > isize)) {
- dout("size %lld -> %llu\n", isize, size);
+ doutc(cl, "size %lld -> %llu\n", isize, size);
if (size > 0 && S_ISDIR(inode->i_mode)) {
- pr_err("fill_file_size non-zero size for directory\n");
+ pr_err_client(cl, "non-zero size for directory\n");
size = 0;
}
i_size_write(inode, size);
@@ -764,8 +776,8 @@ int ceph_fill_file_size(struct inode *inode, int issued,
ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
- dout("%s truncate_seq %u -> %u\n", __func__,
- ci->i_truncate_seq, truncate_seq);
+ doutc(cl, "truncate_seq %u -> %u\n",
+ ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq;
/* the MDS should have revoked these caps */
@@ -794,14 +806,15 @@ int ceph_fill_file_size(struct inode *inode, int issued,
* anyway.
*/
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
- dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__,
- ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode));
+ doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n",
+ ci->i_truncate_size, truncate_size,
+ !!IS_ENCRYPTED(inode));
ci->i_truncate_size = truncate_size;
if (IS_ENCRYPTED(inode)) {
- dout("%s truncate_pagecache_size %lld -> %llu\n",
- __func__, ci->i_truncate_pagecache_size, size);
+ doutc(cl, "truncate_pagecache_size %lld -> %llu\n",
+ ci->i_truncate_pagecache_size, size);
ci->i_truncate_pagecache_size = size;
} else {
ci->i_truncate_pagecache_size = truncate_size;
@@ -814,6 +827,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime, struct timespec64 *atime)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct timespec64 ictime = inode_get_ctime(inode);
int warn = 0;
@@ -825,7 +839,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_XATTR_EXCL)) {
if (ci->i_version == 0 ||
timespec64_compare(ctime, &ictime) > 0) {
- dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
+ doutc(cl, "ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
ictime.tv_sec, ictime.tv_nsec,
ctime->tv_sec, ctime->tv_nsec);
inode_set_ctime_to_ts(inode, *ctime);
@@ -833,30 +847,32 @@ void ceph_fill_file_time(struct inode *inode, int issued,
if (ci->i_version == 0 ||
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
/* the MDS did a utimes() */
- dout("mtime %lld.%09ld -> %lld.%09ld "
- "tw %d -> %d\n",
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ doutc(cl, "mtime %lld.%09ld -> %lld.%09ld tw %d -> %d\n",
+ inode_get_mtime_sec(inode),
+ inode_get_mtime_nsec(inode),
mtime->tv_sec, mtime->tv_nsec,
ci->i_time_warp_seq, (int)time_warp_seq);
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
+ inode_set_mtime_to_ts(inode, *mtime);
+ inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else if (time_warp_seq == ci->i_time_warp_seq) {
+ struct timespec64 ts;
+
/* nobody did utimes(); take the max */
- if (timespec64_compare(mtime, &inode->i_mtime) > 0) {
- dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
- inode->i_mtime.tv_sec,
- inode->i_mtime.tv_nsec,
+ ts = inode_get_mtime(inode);
+ if (timespec64_compare(mtime, &ts) > 0) {
+ doutc(cl, "mtime %lld.%09ld -> %lld.%09ld inc\n",
+ ts.tv_sec, ts.tv_nsec,
mtime->tv_sec, mtime->tv_nsec);
- inode->i_mtime = *mtime;
+ inode_set_mtime_to_ts(inode, *mtime);
}
- if (timespec64_compare(atime, &inode->i_atime) > 0) {
- dout("atime %lld.%09ld -> %lld.%09ld inc\n",
- inode->i_atime.tv_sec,
- inode->i_atime.tv_nsec,
+ ts = inode_get_atime(inode);
+ if (timespec64_compare(atime, &ts) > 0) {
+ doutc(cl, "atime %lld.%09ld -> %lld.%09ld inc\n",
+ ts.tv_sec, ts.tv_nsec,
atime->tv_sec, atime->tv_nsec);
- inode->i_atime = *atime;
+ inode_set_atime_to_ts(inode, *atime);
}
} else if (issued & CEPH_CAP_FILE_EXCL) {
/* we did a utimes(); ignore mds values */
@@ -867,21 +883,24 @@ void ceph_fill_file_time(struct inode *inode, int issued,
/* we have no write|excl caps; whatever the MDS says is true */
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
inode_set_ctime_to_ts(inode, *ctime);
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
+ inode_set_mtime_to_ts(inode, *mtime);
+ inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else {
warn = 1;
}
}
if (warn) /* time_warp_seq shouldn't go backwards */
- dout("%p mds time_warp_seq %llu < %u\n",
- inode, time_warp_seq, ci->i_time_warp_seq);
+ doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode,
+ time_warp_seq, ci->i_time_warp_seq);
}
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
+static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
+ const char *encsym,
+ int enclen, u8 **decsym)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int declen;
u8 *sym;
@@ -891,8 +910,9 @@ static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
declen = ceph_base64_decode(encsym, enclen, sym);
if (declen < 0) {
- pr_err("%s: can't decode symlink (%d). Content: %.*s\n",
- __func__, declen, enclen, encsym);
+ pr_err_client(cl,
+ "can't decode symlink (%d). Content: %.*s\n",
+ declen, enclen, encsym);
kfree(sym);
return -EIO;
}
@@ -901,7 +921,9 @@ static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
return declen;
}
#else
-static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym)
+static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
+ const char *encsym,
+ int symlen, u8 **decsym)
{
return -EOPNOTSUPP;
}
@@ -918,6 +940,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued, new_issued, info_caps;
@@ -936,25 +959,26 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
lockdep_assert_held(&mdsc->snap_rwsem);
- dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
- inode, ceph_vinop(inode), le64_to_cpu(info->version),
- ci->i_version);
+ doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode),
+ le64_to_cpu(info->version), ci->i_version);
/* Once I_NEW is cleared, we can't change type or dev numbers */
if (inode->i_state & I_NEW) {
inode->i_mode = mode;
} else {
if (inode_wrong_type(inode, mode)) {
- pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
- ceph_vinop(inode), inode->i_mode, mode);
+ pr_warn_once_client(cl,
+ "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
return -ESTALE;
}
if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
- pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
- ceph_vinop(inode), MAJOR(inode->i_rdev),
- MINOR(inode->i_rdev), MAJOR(rdev),
- MINOR(rdev));
+ pr_warn_once_client(cl,
+ "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
+ ceph_vinop(inode), MAJOR(inode->i_rdev),
+ MINOR(inode->i_rdev), MAJOR(rdev),
+ MINOR(rdev));
return -ESTALE;
}
}
@@ -976,8 +1000,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
- iinfo->xattr_len);
+ pr_err_client(cl, "ENOMEM xattr blob %d bytes\n",
+ iinfo->xattr_len);
}
if (iinfo->pool_ns_len > 0)
@@ -1031,9 +1055,10 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kgid(&init_user_ns, inode->i_gid));
+ doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
+ ceph_vinop(inode), inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
}
@@ -1089,7 +1114,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
size = fsize;
} else {
- pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
+ pr_warn_client(cl,
+ "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
info->size, size);
}
}
@@ -1101,8 +1127,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
/* only update max_size on auth cap */
if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
+ doutc(cl, "max_size %lld -> %llu\n",
+ ci->i_max_size, le64_to_cpu(info->max_size));
ci->i_max_size = le64_to_cpu(info->max_size);
}
}
@@ -1165,15 +1191,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (IS_ENCRYPTED(inode)) {
if (symlen != i_size_read(inode))
- pr_err("%s %llx.%llx BAD symlink size %lld\n",
- __func__, ceph_vinop(inode),
+ pr_err_client(cl,
+ "%p %llx.%llx BAD symlink size %lld\n",
+ inode, ceph_vinop(inode),
i_size_read(inode));
- err = decode_encrypted_symlink(iinfo->symlink,
+ err = decode_encrypted_symlink(mdsc, iinfo->symlink,
symlen, (u8 **)&sym);
if (err < 0) {
- pr_err("%s decoding encrypted symlink failed: %d\n",
- __func__, err);
+ pr_err_client(cl,
+ "decoding encrypted symlink failed: %d\n",
+ err);
goto out;
}
symlen = err;
@@ -1181,8 +1209,9 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_blocks = calc_inode_blocks(symlen);
} else {
if (symlen != i_size_read(inode)) {
- pr_err("%s %llx.%llx BAD symlink size %lld\n",
- __func__, ceph_vinop(inode),
+ pr_err_client(cl,
+ "%p %llx.%llx BAD symlink size %lld\n",
+ inode, ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
@@ -1217,8 +1246,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
- pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
- ceph_vinop(inode), inode->i_mode);
+ pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode,
+ ceph_vinop(inode), inode->i_mode);
}
/* were we issued a capability? */
@@ -1239,7 +1268,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
(info_caps & CEPH_CAP_FILE_SHARED) &&
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) {
- dout(" marking %p complete (empty)\n", inode);
+ doutc(cl, " marking %p complete (empty)\n",
+ inode);
i_size_write(inode, 0);
__ceph_dir_set_complete(ci,
atomic64_read(&ci->i_release_count),
@@ -1248,8 +1278,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
wake = true;
} else {
- dout(" %p got snap_caps %s\n", inode,
- ceph_cap_string(info_caps));
+ doutc(cl, " %p got snap_caps %s\n", inode,
+ ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
}
}
@@ -1265,8 +1295,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (cap_fmode >= 0) {
if (!info_caps)
- pr_warn("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
+ pr_warn_client(cl, "mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
__ceph_touch_fmode(ci, mdsc, cap_fmode);
}
@@ -1312,14 +1342,14 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
unsigned long from_time,
struct ceph_mds_session **old_lease_session)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
- dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
- dentry, duration, ttl);
+ doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl);
/* only track leases on regular dentries */
if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -1420,6 +1450,7 @@ out_unlock:
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
+ struct ceph_client *cl = ceph_inode_to_client(in);
struct dentry *dn = *pdn;
struct dentry *realdn;
@@ -1451,23 +1482,21 @@ static int splice_dentry(struct dentry **pdn, struct inode *in)
d_drop(dn);
realdn = d_splice_alias(in, dn);
if (IS_ERR(realdn)) {
- pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
- PTR_ERR(realdn), dn, in, ceph_vinop(in));
+ pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
+ PTR_ERR(realdn), dn, in, ceph_vinop(in));
return PTR_ERR(realdn);
}
if (realdn) {
- dout("dn %p (%d) spliced with %p (%d) "
- "inode %p ino %llx.%llx\n",
- dn, d_count(dn),
- realdn, d_count(realdn),
- d_inode(realdn), ceph_vinop(d_inode(realdn)));
+ doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n",
+ dn, d_count(dn), realdn, d_count(realdn),
+ d_inode(realdn), ceph_vinop(d_inode(realdn)));
dput(dn);
*pdn = realdn;
} else {
BUG_ON(!ceph_dentry(dn));
- dout("dn %p attached to %p ino %llx.%llx\n",
- dn, d_inode(dn), ceph_vinop(d_inode(dn)));
+ doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn,
+ d_inode(dn), ceph_vinop(d_inode(dn)));
}
return 0;
}
@@ -1489,14 +1518,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
struct ceph_vino tvino, dvino;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
int err = 0;
- dout("fill_trace %p is_dentry %d is_target %d\n", req,
- rinfo->head->is_dentry, rinfo->head->is_target);
+ doutc(cl, "%p is_dentry %d is_target %d\n", req,
+ rinfo->head->is_dentry, rinfo->head->is_target);
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
- dout("fill_trace reply is empty!\n");
+ doutc(cl, "reply is empty!\n");
if (rinfo->head->result == 0 && req->r_parent)
ceph_invalidate_dir_request(req);
return 0;
@@ -1553,13 +1583,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
retry_lookup:
dn = d_lookup(parent, &dname);
- dout("d_lookup on parent=%p name=%.*s got %p\n",
- parent, dname.len, dname.name, dn);
+ doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
if (!dn) {
dn = d_alloc(parent, &dname);
- dout("d_alloc %p '%.*s' = %p\n", parent,
- dname.len, dname.name, dn);
+ doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
if (!dn) {
dput(parent);
ceph_fname_free_buffer(dir, &oname);
@@ -1575,8 +1605,8 @@ retry_lookup:
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != tvino.snap)) {
- dout(" dn %p points to wrong inode %p\n",
- dn, d_inode(dn));
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
ceph_dir_clear_ordered(dir);
d_delete(dn);
dput(dn);
@@ -1601,8 +1631,8 @@ retry_lookup:
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
- pr_err("ceph_fill_inode badness %p %llx.%llx\n",
- in, ceph_vinop(in));
+ pr_err_client(cl, "badness %p %llx.%llx\n", in,
+ ceph_vinop(in));
req->r_target_inode = NULL;
if (in->i_state & I_NEW)
discard_new_inode(in);
@@ -1652,36 +1682,32 @@ retry_lookup:
have_lease = have_dir_cap ||
le32_to_cpu(rinfo->dlease->duration_ms);
if (!have_lease)
- dout("fill_trace no dentry lease or dir cap\n");
+ doutc(cl, "no dentry lease or dir cap\n");
/* rename? */
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
struct inode *olddir = req->r_old_dentry_dir;
BUG_ON(!olddir);
- dout(" src %p '%pd' dst %p '%pd'\n",
- req->r_old_dentry,
- req->r_old_dentry,
- dn, dn);
- dout("fill_trace doing d_move %p -> %p\n",
- req->r_old_dentry, dn);
+ doutc(cl, " src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry, req->r_old_dentry, dn, dn);
+ doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn);
- dout(" src %p '%pd' dst %p '%pd'\n",
- req->r_old_dentry,
- req->r_old_dentry,
- dn, dn);
+ doutc(cl, " src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry, req->r_old_dentry, dn, dn);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- dout("dn %p gets new offset %lld\n", req->r_old_dentry,
- ceph_dentry(req->r_old_dentry)->offset);
+ doutc(cl, "dn %p gets new offset %lld\n",
+ req->r_old_dentry,
+ ceph_dentry(req->r_old_dentry)->offset);
/* swap r_dentry and r_old_dentry in case that
* splice_dentry() gets called later. This is safe
@@ -1693,9 +1719,9 @@ retry_lookup:
/* null dentry? */
if (!rinfo->head->is_target) {
- dout("fill_trace null dentry\n");
+ doutc(cl, "null dentry\n");
if (d_really_is_positive(dn)) {
- dout("d_delete %p\n", dn);
+ doutc(cl, "d_delete %p\n", dn);
ceph_dir_clear_ordered(dir);
d_delete(dn);
} else if (have_lease) {
@@ -1719,9 +1745,9 @@ retry_lookup:
goto done;
dn = req->r_dentry; /* may have spliced */
} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
- dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, d_inode(dn), ceph_vinop(d_inode(dn)),
- ceph_vinop(in));
+ doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n",
+ dn, d_inode(dn), ceph_vinop(d_inode(dn)),
+ ceph_vinop(in));
d_invalidate(dn);
have_lease = false;
}
@@ -1731,7 +1757,7 @@ retry_lookup:
rinfo->dlease, session,
req->r_request_started);
}
- dout(" final dn %p\n", dn);
+ doutc(cl, " final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
@@ -1742,7 +1768,8 @@ retry_lookup:
BUG_ON(!dir);
BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
BUG_ON(!req->r_dentry);
- dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
+ doutc(cl, " linking snapped dir %p to dn %p\n", in,
+ req->r_dentry);
ceph_dir_clear_ordered(dir);
ihold(in);
err = splice_dentry(&req->r_dentry, in);
@@ -1764,7 +1791,7 @@ retry_lookup:
&dvino, ptvino);
}
done:
- dout("fill_trace done err=%d\n", err);
+ doutc(cl, "done err=%d\n", err);
return err;
}
@@ -1775,6 +1802,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
int i, err = 0;
for (i = 0; i < rinfo->dir_nr; i++) {
@@ -1789,14 +1817,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
if (IS_ERR(in)) {
err = PTR_ERR(in);
- dout("new_inode badness got %d\n", err);
+ doutc(cl, "badness got %d\n", err);
continue;
}
rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
-1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("ceph_fill_inode badness on %p got %d\n",
- in, rc);
+ pr_err_client(cl, "inode badness on %p got %d\n", in,
+ rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
@@ -1825,6 +1853,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
struct ceph_readdir_cache_control *ctl,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_inode_info *ci = ceph_inode(dir);
unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
unsigned idx = ctl->index % nsize;
@@ -1850,11 +1879,11 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
- dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+ doutc(cl, "dn %p idx %d\n", dn, ctl->index);
ctl->dentries[idx] = dn;
ctl->index++;
} else {
- dout("disable readdir cache\n");
+ doutc(cl, "disable readdir cache\n");
ctl->index = -1;
}
return 0;
@@ -1867,6 +1896,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *inode = d_inode(parent);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct qstr dname;
struct dentry *dn;
struct inode *in;
@@ -1894,19 +1924,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
- dout("readdir_prepopulate got new frag %x -> %x\n",
- frag, le32_to_cpu(rinfo->dir_dir->frag));
+ doutc(cl, "got new frag %x -> %x\n", frag,
+ le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
if (!rinfo->hash_order)
req->r_readdir_offset = 2;
}
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
- dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
- rinfo->dir_nr, parent);
+ doutc(cl, "%d items under SNAPDIR dn %p\n",
+ rinfo->dir_nr, parent);
} else {
- dout("readdir_prepopulate %d items under dn %p\n",
- rinfo->dir_nr, parent);
+ doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent);
if (rinfo->dir_dir)
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
@@ -1950,15 +1979,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
retry_lookup:
dn = d_lookup(parent, &dname);
- dout("d_lookup on parent=%p name=%.*s got %p\n",
- parent, dname.len, dname.name, dn);
+ doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
if (!dn) {
dn = d_alloc(parent, &dname);
- dout("d_alloc %p '%.*s' = %p\n", parent,
- dname.len, dname.name, dn);
+ doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
if (!dn) {
- dout("d_alloc badness\n");
+ doutc(cl, "d_alloc badness\n");
err = -ENOMEM;
goto out;
}
@@ -1971,8 +2000,8 @@ retry_lookup:
(ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != tvino.snap)) {
struct ceph_dentry_info *di = ceph_dentry(dn);
- dout(" dn %p points to wrong inode %p\n",
- dn, d_inode(dn));
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
spin_lock(&dn->d_lock);
if (di->offset > 0 &&
@@ -1994,7 +2023,7 @@ retry_lookup:
} else {
in = ceph_get_inode(parent->d_sb, tvino, NULL);
if (IS_ERR(in)) {
- dout("new_inode badness\n");
+ doutc(cl, "new_inode badness\n");
d_drop(dn);
dput(dn);
err = PTR_ERR(in);
@@ -2005,7 +2034,8 @@ retry_lookup:
ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
-1, &req->r_caps_reservation);
if (ret < 0) {
- pr_err("ceph_fill_inode badness on %p\n", in);
+ pr_err_client(cl, "badness on %p %llx.%llx\n", in,
+ ceph_vinop(in));
if (d_really_is_negative(dn)) {
if (in->i_state & I_NEW) {
ihold(in);
@@ -2022,8 +2052,8 @@ retry_lookup:
if (d_really_is_negative(dn)) {
if (ceph_security_xattr_deadlock(in)) {
- dout(" skip splicing dn %p to inode %p"
- " (security xattr deadlock)\n", dn, in);
+ doutc(cl, " skip splicing dn %p to inode %p"
+ " (security xattr deadlock)\n", dn, in);
iput(in);
skipped++;
goto next_item;
@@ -2055,17 +2085,18 @@ out:
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
- dout("readdir_prepopulate done\n");
+ doutc(cl, "done\n");
return err;
}
bool ceph_inode_set_size(struct inode *inode, loff_t size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
bool ret;
spin_lock(&ci->i_ceph_lock);
- dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
+ doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
i_size_write(inode, size);
ceph_fscache_update(inode);
inode->i_blocks = calc_inode_blocks(size);
@@ -2079,22 +2110,25 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
set_bit(work_bit, &ci->i_work_mask);
ihold(inode);
if (queue_work(fsc->inode_wq, &ci->i_work)) {
- dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
+ doutc(cl, "%p %llx.%llx mask=%lx\n", inode,
+ ceph_vinop(inode), ci->i_work_mask);
} else {
- dout("queue_inode_work %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
+ doutc(cl, "%p %llx.%llx already queued, mask=%lx\n",
+ inode, ceph_vinop(inode), ci->i_work_mask);
iput(inode);
}
}
static void ceph_do_invalidate_pages(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u32 orig_gen;
int check = 0;
@@ -2104,8 +2138,9 @@ static void ceph_do_invalidate_pages(struct inode *inode)
mutex_lock(&ci->i_truncate_mutex);
if (ceph_inode_is_shutdown(inode)) {
- pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
- __func__, ceph_vinop(inode));
+ pr_warn_ratelimited_client(cl,
+ "%p %llx.%llx is shut down\n", inode,
+ ceph_vinop(inode));
mapping_set_error(inode->i_mapping, -EIO);
truncate_pagecache(inode, 0);
mutex_unlock(&ci->i_truncate_mutex);
@@ -2113,8 +2148,8 @@ static void ceph_do_invalidate_pages(struct inode *inode)
}
spin_lock(&ci->i_ceph_lock);
- dout("invalidate_pages %p gen %d revoking %d\n", inode,
- ci->i_rdcache_gen, ci->i_rdcache_revoking);
+ doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode,
+ ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
check = 1;
@@ -2126,21 +2161,21 @@ static void ceph_do_invalidate_pages(struct inode *inode)
spin_unlock(&ci->i_ceph_lock);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
- pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
- ceph_vinop(inode));
+ pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n",
+ ceph_vinop(inode));
}
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
orig_gen == ci->i_rdcache_revoking) {
- dout("invalidate_pages %p gen %d successful\n", inode,
- ci->i_rdcache_gen);
+ doutc(cl, "%p %llx.%llx gen %d successful\n", inode,
+ ceph_vinop(inode), ci->i_rdcache_gen);
ci->i_rdcache_revoking--;
check = 1;
} else {
- dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
- inode, orig_gen, ci->i_rdcache_gen,
- ci->i_rdcache_revoking);
+ doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n",
+ inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen,
+ ci->i_rdcache_revoking);
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
check = 1;
}
@@ -2157,6 +2192,7 @@ out:
*/
void __ceph_do_pending_vmtruncate(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
int wrbuffer_refs, finish = 0;
@@ -2165,7 +2201,8 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
- dout("%s %p none pending\n", __func__, inode);
+ doutc(cl, "%p %llx.%llx none pending\n", inode,
+ ceph_vinop(inode));
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
return;
@@ -2177,7 +2214,8 @@ retry:
*/
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
spin_unlock(&ci->i_ceph_lock);
- dout("%s %p flushing snaps first\n", __func__, inode);
+ doutc(cl, "%p %llx.%llx flushing snaps first\n", inode,
+ ceph_vinop(inode));
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
@@ -2188,8 +2226,8 @@ retry:
to = ci->i_truncate_pagecache_size;
wrbuffer_refs = ci->i_wrbuffer_ref;
- dout("%s %p (%d) to %lld\n", __func__, inode,
- ci->i_truncate_pending, to);
+ doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode),
+ ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
ceph_fscache_resize(inode, to);
@@ -2217,9 +2255,10 @@ static void ceph_inode_work(struct work_struct *work)
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
- dout("writeback %p\n", inode);
+ doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode));
filemap_fdatawrite(&inode->i_data);
}
if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
@@ -2291,6 +2330,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
struct ceph_mds_request *req,
struct iattr *attr)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
loff_t pos, orig_pos = round_down(attr->ia_size,
@@ -2313,9 +2353,9 @@ static int fill_fscrypt_truncate(struct inode *inode,
issued = __ceph_caps_issued(ci, NULL);
- dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
- i_size, attr->ia_size, ceph_cap_string(got),
- ceph_cap_string(issued));
+ doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n",
+ i_size, attr->ia_size, ceph_cap_string(got),
+ ceph_cap_string(issued));
/* Try to writeback the dirty pagecaches */
if (issued & (CEPH_CAP_FILE_BUFFER)) {
@@ -2370,8 +2410,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
* If the Rados object doesn't exist, it will be set to 0.
*/
if (!objver) {
- dout("%s hit hole, ppos %lld < size %lld\n", __func__,
- pos, i_size);
+ doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size);
header.data_len = cpu_to_le32(8 + 8 + 4);
header.file_offset = 0;
@@ -2380,8 +2419,8 @@ static int fill_fscrypt_truncate(struct inode *inode,
header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
header.file_offset = cpu_to_le64(orig_pos);
- dout("%s encrypt block boff/bsize %d/%lu\n", __func__,
- boff, CEPH_FSCRYPT_BLOCK_SIZE);
+ doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
+ CEPH_FSCRYPT_BLOCK_SIZE);
/* truncate and zero out the extra contents for the last block */
memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
@@ -2409,8 +2448,8 @@ static int fill_fscrypt_truncate(struct inode *inode,
}
req->r_pagelist = pagelist;
out:
- dout("%s %p size dropping cap refs on %s\n", __func__,
- inode, ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
if (iov.iov_base)
kunmap_local(iov.iov_base);
@@ -2421,13 +2460,14 @@ out:
return ret;
}
-int __ceph_setattr(struct inode *inode, struct iattr *attr,
- struct ceph_iattr *cia)
+int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct iattr *attr, struct ceph_iattr *cia)
{
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap_flush *prealloc_cf;
loff_t isize = i_size_read(inode);
int issued;
@@ -2466,7 +2506,8 @@ retry:
}
}
- dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
+ ceph_cap_string(issued));
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
if (cia && cia->fscrypt_auth) {
u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
@@ -2477,8 +2518,8 @@ retry:
goto out;
}
- dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
- ceph_vinop(inode), ci->fscrypt_auth_len, len);
+ doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
+ ceph_vinop(inode), ci->fscrypt_auth_len, len);
/* It should never be re-set once set */
WARN_ON_ONCE(ci->fscrypt_auth);
@@ -2506,38 +2547,44 @@ retry:
#endif /* CONFIG_FS_ENCRYPTION */
if (ia_valid & ATTR_UID) {
- dout("setattr %p uid %d -> %d\n", inode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kuid(&init_user_ns, attr->ia_uid));
+ kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
+
+ doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
+ ceph_vinop(inode),
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kuid(&init_user_ns, attr->ia_uid));
if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_uid = attr->ia_uid;
+ inode->i_uid = fsuid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- !uid_eq(attr->ia_uid, inode->i_uid)) {
+ !uid_eq(fsuid, inode->i_uid)) {
req->r_args.setattr.uid = cpu_to_le32(
- from_kuid(&init_user_ns, attr->ia_uid));
+ from_kuid(&init_user_ns, fsuid));
mask |= CEPH_SETATTR_UID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_GID) {
- dout("setattr %p gid %d -> %d\n", inode,
- from_kgid(&init_user_ns, inode->i_gid),
- from_kgid(&init_user_ns, attr->ia_gid));
+ kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
+
+ doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
+ ceph_vinop(inode),
+ from_kgid(&init_user_ns, inode->i_gid),
+ from_kgid(&init_user_ns, attr->ia_gid));
if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_gid = attr->ia_gid;
+ inode->i_gid = fsgid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- !gid_eq(attr->ia_gid, inode->i_gid)) {
+ !gid_eq(fsgid, inode->i_gid)) {
req->r_args.setattr.gid = cpu_to_le32(
- from_kgid(&init_user_ns, attr->ia_gid));
+ from_kgid(&init_user_ns, fsgid));
mask |= CEPH_SETATTR_GID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_MODE) {
- dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
- attr->ia_mode);
+ doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
+ ceph_vinop(inode), inode->i_mode, attr->ia_mode);
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_mode = attr->ia_mode;
dirtied |= CEPH_CAP_AUTH_EXCL;
@@ -2551,20 +2598,23 @@ retry:
}
if (ia_valid & ATTR_ATIME) {
- dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
- inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
- attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+ struct timespec64 atime = inode_get_atime(inode);
+
+ doutc(cl, "%p %llx.%llx atime %lld.%09ld -> %lld.%09ld\n",
+ inode, ceph_vinop(inode),
+ atime.tv_sec, atime.tv_nsec,
+ attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
if (issued & CEPH_CAP_FILE_EXCL) {
ci->i_time_warp_seq++;
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec64_compare(&inode->i_atime,
- &attr->ia_atime) < 0) {
- inode->i_atime = attr->ia_atime;
+ timespec64_compare(&atime,
+ &attr->ia_atime) < 0) {
+ inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_WR;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec64_equal(&inode->i_atime, &attr->ia_atime)) {
+ !timespec64_equal(&atime, &attr->ia_atime)) {
ceph_encode_timespec64(&req->r_args.setattr.atime,
&attr->ia_atime);
mask |= CEPH_SETATTR_ATIME;
@@ -2573,7 +2623,8 @@ retry:
}
}
if (ia_valid & ATTR_SIZE) {
- dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
+ doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
+ ceph_vinop(inode), isize, attr->ia_size);
/*
* Only when the new size is smaller and not aligned to
* CEPH_FSCRYPT_BLOCK_SIZE will the RMW be needed.
@@ -2624,20 +2675,22 @@ retry:
}
}
if (ia_valid & ATTR_MTIME) {
- dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
- attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+ struct timespec64 mtime = inode_get_mtime(inode);
+
+ doutc(cl, "%p %llx.%llx mtime %lld.%09ld -> %lld.%09ld\n",
+ inode, ceph_vinop(inode),
+ mtime.tv_sec, mtime.tv_nsec,
+ attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
if (issued & CEPH_CAP_FILE_EXCL) {
ci->i_time_warp_seq++;
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec64_compare(&inode->i_mtime,
- &attr->ia_mtime) < 0) {
- inode->i_mtime = attr->ia_mtime;
+ timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_WR;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) {
+ !timespec64_equal(&mtime, &attr->ia_mtime)) {
ceph_encode_timespec64(&req->r_args.setattr.mtime,
&attr->ia_mtime);
mask |= CEPH_SETATTR_MTIME;
@@ -2650,11 +2703,12 @@ retry:
if (ia_valid & ATTR_CTIME) {
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
- dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
- inode_get_ctime(inode).tv_sec,
- inode_get_ctime(inode).tv_nsec,
- attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
- only ? "ctime only" : "ignored");
+ doutc(cl, "%p %llx.%llx ctime %lld.%09ld -> %lld.%09ld (%s)\n",
+ inode, ceph_vinop(inode),
+ inode_get_ctime_sec(inode),
+ inode_get_ctime_nsec(inode),
+ attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+ only ? "ctime only" : "ignored");
if (only) {
/*
* if kernel wants to dirty ctime but nothing else,
@@ -2672,7 +2726,8 @@ retry:
}
}
if (ia_valid & ATTR_FILE)
- dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+ doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode,
+ ceph_vinop(inode));
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
@@ -2713,16 +2768,17 @@ retry:
*/
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == -EAGAIN && truncate_retry--) {
- dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
- inode, err, ceph_cap_string(dirtied), mask);
+ doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n",
+ inode, ceph_vinop(inode), err,
+ ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
goto retry;
}
}
out:
- dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
- ceph_cap_string(dirtied), mask);
+ doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode,
+ ceph_vinop(inode), err, ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
@@ -2740,7 +2796,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int err;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -2753,7 +2809,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (err)
return err;
- err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+ err = setattr_prepare(idmap, dentry, attr);
if (err != 0)
return err;
@@ -2765,10 +2821,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
return -EDQUOT;
- err = __ceph_setattr(inode, attr, NULL);
+ err = __ceph_setattr(idmap, inode, attr, NULL);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
+ err = posix_acl_chmod(idmap, dentry, attr->ia_mode);
return err;
}
@@ -2810,19 +2866,21 @@ int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int mode;
int err;
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("do_getattr inode %p SNAPDIR\n", inode);
+ doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode,
+ ceph_vinop(inode));
return 0;
}
- dout("do_getattr inode %p mask %s mode 0%o\n",
- inode, ceph_cap_string(mask), inode->i_mode);
+ doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode,
+ ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode);
if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
return 0;
@@ -2849,14 +2907,15 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
}
}
ceph_mdsc_put_request(req);
- dout("do_getattr result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int mode = USE_AUTH_MDS;
@@ -2886,7 +2945,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
xattr_value = req->r_reply_info.xattr_info.xattr_value;
xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
- dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+ doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
err = (int)xattr_value_len;
if (size == 0)
@@ -2901,7 +2960,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
put:
ceph_mdsc_put_request(req);
out:
- dout("do_getvxattr result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
@@ -2921,7 +2980,7 @@ int ceph_permission(struct mnt_idmap *idmap, struct inode *inode,
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
if (!err)
- err = generic_permission(&nop_mnt_idmap, inode, mask);
+ err = generic_permission(idmap, inode, mask);
return err;
}
@@ -2978,7 +3037,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
return err;
}
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->ino = ceph_present_inode(inode);
/*
@@ -3001,7 +3060,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
+ if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) {
stat->size = ci->i_rbytes;
} else if (ceph_snap(inode) == CEPH_SNAPDIR) {
struct ceph_inode_info *pci;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 91a84917d203..e861de3c79b9 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -65,7 +65,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
struct ceph_inode_info *ci = ceph_inode(file_inode(file));
@@ -140,7 +140,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
int err;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
/* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
@@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
- &ceph_sb_to_client(inode->i_sb)->client->osdc;
+ &ceph_sb_to_fs_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
CEPH_DEFINE_OID_ONSTACK(oid);
u32 xlen;
@@ -244,7 +244,8 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
@@ -252,11 +253,13 @@ static long ceph_ioctl_lazyio(struct file *file)
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
__ceph_touch_fmode(ci, mdsc, fi->fmode);
spin_unlock(&ci->i_ceph_lock);
- dout("ioctl_layzio: file %p marked lazy\n", file);
+ doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode,
+ ceph_vinop(inode));
ceph_check_caps(ci, 0);
} else {
- dout("ioctl_layzio: file %p already lazy\n", file);
+ doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode,
+ ceph_vinop(inode));
}
return 0;
}
@@ -355,10 +358,12 @@ static const char *ceph_ioctl_cmd_name(const unsigned int cmd)
long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
+ struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int ret;
- dout("ioctl file %p cmd %s arg %lu\n", file,
- ceph_ioctl_cmd_name(cmd), arg);
+ doutc(fsc->client, "file %p %p %llx.%llx cmd %s arg %lu\n", file,
+ inode, ceph_vinop(inode), ceph_ioctl_cmd_name(cmd), arg);
switch (cmd) {
case CEPH_IOC_GET_LAYOUT:
return ceph_ioctl_get_layout(file, (void __user *)arg);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index cb51c7e9c8e2..e07ad29ff8b9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -77,6 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int err;
u64 length = 0;
@@ -111,10 +112,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
owner = secure_addr(fl->fl_owner);
- dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
- "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
- (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
- wait, fl->fl_type);
+ doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d\n",
+ (int)lock_type, (int)operation, owner, (u64)fl->fl_pid,
+ fl->fl_start, length, wait, fl->fl_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
@@ -147,16 +148,17 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
}
ceph_mdsc_put_request(req);
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type, err);
+ doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
+ "length: %llu, wait: %d, type: %d, err code %d\n",
+ (int)lock_type, (int)operation, (u64)fl->fl_pid,
+ fl->fl_start, length, wait, fl->fl_type, err);
return err;
}
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *intr_req;
struct inode *inode = req->r_inode;
int err, lock_type;
@@ -174,8 +176,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
if (!err)
return 0;
- dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
- req->r_tid);
+ doutc(cl, "request %llu was interrupted\n", req->r_tid);
mutex_lock(&mdsc->mutex);
if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
@@ -246,6 +247,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
u8 wait = 0;
@@ -257,7 +259,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
+ doutc(cl, "fl_owner: %p\n", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it */
if (IS_GETLK(cmd))
@@ -292,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
- dout("mds locked, locking locally\n");
+ doutc(cl, "locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
/* undo! This should only happen if
@@ -300,8 +302,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
* deadlock. */
ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on posix_lock_file, undid lock\n",
- err);
+ doutc(cl, "got %d on posix_lock_file, undid lock\n",
+ err);
}
}
}
@@ -312,6 +314,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
u8 wait = 0;
u8 lock_cmd;
@@ -322,7 +325,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- dout("ceph_flock, fl_file: %p\n", fl->fl_file);
+ doutc(cl, "fl_file: %p\n", fl->fl_file);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
@@ -359,7 +362,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
inode, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on locks_lock_file_wait, undid lock\n", err);
+ doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
+ err);
}
}
return err;
@@ -371,6 +375,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
*/
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct file_lock *lock;
struct file_lock_context *ctx;
@@ -386,17 +391,20 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
++(*flock_count);
spin_unlock(&ctx->flc_lock);
}
- dout("counted %d flock locks and %d fcntl locks\n",
- *flock_count, *fcntl_count);
+ doutc(cl, "counted %d flock locks and %d fcntl locks\n",
+ *flock_count, *fcntl_count);
}
/*
* Given a pointer to a lock, convert it to a ceph filelock
*/
-static int lock_to_ceph_filelock(struct file_lock *lock,
+static int lock_to_ceph_filelock(struct inode *inode,
+ struct file_lock *lock,
struct ceph_filelock *cephlock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
+
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
@@ -414,7 +422,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock,
cephlock->type = CEPH_LOCK_UNLOCK;
break;
default:
- dout("Have unknown lock type %d\n", lock->fl_type);
+ doutc(cl, "Have unknown lock type %d\n", lock->fl_type);
err = -EINVAL;
}
@@ -432,13 +440,14 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
{
struct file_lock *lock;
struct file_lock_context *ctx = locks_inode_context(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
int l = 0;
- dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
- num_fcntl_locks);
+ doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
+ num_fcntl_locks);
if (!ctx)
return 0;
@@ -450,7 +459,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &flocks[l]);
+ err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
if (err)
goto fail;
++l;
@@ -461,7 +470,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &flocks[l]);
+ err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
if (err)
goto fail;
++l;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 293b93182955..d95eb525519a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -12,6 +12,7 @@
#include <linux/bits.h>
#include <linux/ktime.h>
#include <linux/bitmap.h>
+#include <linux/mnt_idmapping.h>
#include "super.h"
#include "mds_client.h"
@@ -411,6 +412,7 @@ static int parse_reply_info_readdir(void **p, void *end,
u64 features)
{
struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
u32 num, i = 0;
int err;
@@ -433,7 +435,7 @@ static int parse_reply_info_readdir(void **p, void *end,
BUG_ON(!info->dir_entries);
if ((unsigned long)(info->dir_entries + num) >
(unsigned long)info->dir_entries + info->dir_buf_size) {
- pr_err("dir contents are larger than expected\n");
+ pr_err_client(cl, "dir contents are larger than expected\n");
WARN_ON(1);
goto bad;
}
@@ -454,7 +456,7 @@ static int parse_reply_info_readdir(void **p, void *end,
ceph_decode_need(p, end, _name_len, bad);
_name = *p;
*p += _name_len;
- dout("parsed dir dname '%.*s'\n", _name_len, _name);
+ doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name);
if (info->hash_order)
rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
@@ -514,8 +516,8 @@ static int parse_reply_info_readdir(void **p, void *end,
rde->is_nokey = false;
err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);
if (err) {
- pr_err("%s unable to decode %.*s, got %d\n", __func__,
- _name_len, _name, err);
+ pr_err_client(cl, "unable to decode %.*s, got %d\n",
+ _name_len, _name, err);
goto out_bad;
}
rde->name = oname.name;
@@ -539,7 +541,7 @@ done:
bad:
err = -EIO;
out_bad:
- pr_err("problem parsing dir contents %d\n", err);
+ pr_err_client(cl, "problem parsing dir contents %d\n", err);
return err;
}
@@ -570,10 +572,11 @@ bad:
static int ceph_parse_deleg_inos(void **p, void *end,
struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
u32 sets;
ceph_decode_32_safe(p, end, sets, bad);
- dout("got %u sets of delegated inodes\n", sets);
+ doutc(cl, "got %u sets of delegated inodes\n", sets);
while (sets--) {
u64 start, len;
@@ -582,8 +585,9 @@ static int ceph_parse_deleg_inos(void **p, void *end,
/* Don't accept a delegation of system inodes */
if (start < CEPH_INO_SYSTEM_BASE) {
- pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
- start, len);
+ pr_warn_ratelimited_client(cl,
+ "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
+ start, len);
continue;
}
while (len--) {
@@ -591,10 +595,10 @@ static int ceph_parse_deleg_inos(void **p, void *end,
DELEGATED_INO_AVAILABLE,
GFP_KERNEL);
if (!err) {
- dout("added delegated inode 0x%llx\n",
- start - 1);
+ doutc(cl, "added delegated inode 0x%llx\n", start - 1);
} else if (err == -EBUSY) {
- pr_warn("MDS delegated inode 0x%llx more than once.\n",
+ pr_warn_client(cl,
+ "MDS delegated inode 0x%llx more than once.\n",
start - 1);
} else {
return err;
@@ -744,6 +748,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
struct ceph_mds_request *req, u64 features)
{
struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
void *p, *end;
u32 len;
int err;
@@ -783,7 +788,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
bad:
err = -EIO;
out_bad:
- pr_err("mds parse_reply err %d\n", err);
+ pr_err_client(cl, "mds parse_reply err %d\n", err);
ceph_msg_dump(msg);
return err;
}
@@ -830,7 +835,8 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
*/
int ceph_wait_on_conflict_unlink(struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct ceph_client *cl = fsc->client;
struct dentry *pdentry = dentry->d_parent;
struct dentry *udentry, *found = NULL;
struct ceph_dentry_info *di;
@@ -855,8 +861,8 @@ int ceph_wait_on_conflict_unlink(struct dentry *dentry)
goto next;
if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
- pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
- __func__, dentry, dentry);
+ pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n",
+ dentry, dentry);
if (!d_same_name(udentry, pdentry, &dname))
goto next;
@@ -872,8 +878,8 @@ next:
if (likely(!found))
return 0;
- dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
- dentry, dentry, found, found);
+ doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry,
+ found, found);
err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
TASK_KILLABLE);
@@ -957,6 +963,7 @@ static int __verify_registered_session(struct ceph_mds_client *mdsc,
static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
int mds)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *s;
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
@@ -973,7 +980,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
int newmax = 1 << get_count_order(mds + 1);
struct ceph_mds_session **sa;
- dout("%s: realloc to %d\n", __func__, newmax);
+ doutc(cl, "realloc to %d\n", newmax);
sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
if (!sa)
goto fail_realloc;
@@ -986,7 +993,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
mdsc->max_sessions = newmax;
}
- dout("%s: mds%d\n", __func__, mds);
+ doutc(cl, "mds%d\n", mds);
s->s_mdsc = mdsc;
s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW;
@@ -1029,7 +1036,7 @@ fail_realloc:
static void __unregister_session(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
- dout("__unregister_session mds%d %p\n", s->s_mds, s);
+ doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
ceph_con_close(&s->s_con);
@@ -1116,6 +1123,8 @@ void ceph_mdsc_release_request(struct kref *kref)
kfree(req->r_path1);
kfree(req->r_path2);
put_cred(req->r_cred);
+ if (req->r_mnt_idmap)
+ mnt_idmap_put(req->r_mnt_idmap);
if (req->r_pagelist)
ceph_pagelist_release(req->r_pagelist);
kfree(req->r_fscrypt_auth);
@@ -1155,6 +1164,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
struct inode *dir)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int ret = 0;
req->r_tid = ++mdsc->last_tid;
@@ -1162,18 +1172,20 @@ static void __register_request(struct ceph_mds_client *mdsc,
ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
req->r_num_caps);
if (ret < 0) {
- pr_err("__register_request %p "
- "failed to reserve caps: %d\n", req, ret);
+ pr_err_client(cl, "%p failed to reserve caps: %d\n",
+ req, ret);
/* set req->r_err to fail early from __do_request */
req->r_err = ret;
return;
}
}
- dout("__register_request %p tid %lld\n", req, req->r_tid);
+ doutc(cl, "%p tid %lld\n", req, req->r_tid);
ceph_mdsc_get_request(req);
insert_request(&mdsc->request_tree, req);
req->r_cred = get_current_cred();
+ if (!req->r_mnt_idmap)
+ req->r_mnt_idmap = &nop_mnt_idmap;
if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
mdsc->oldest_tid = req->r_tid;
@@ -1192,7 +1204,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
static void __unregister_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid);
/* Never leave an unregistered request on an unsafe list! */
list_del_init(&req->r_unsafe_item);
@@ -1278,6 +1290,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
int mds = -1;
u32 hash = req->r_direct_hash;
bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ struct ceph_client *cl = mdsc->fsc->client;
if (random)
*random = false;
@@ -1289,8 +1302,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (req->r_resend_mds >= 0 &&
(__have_session(mdsc, req->r_resend_mds) ||
ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
- dout("%s using resend_mds mds%d\n", __func__,
- req->r_resend_mds);
+ doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds);
return req->r_resend_mds;
}
@@ -1307,7 +1319,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_lock();
inode = get_nonsnap_parent(req->r_dentry);
rcu_read_unlock();
- dout("%s using snapdir's parent %p\n", __func__, inode);
+ doutc(cl, "using snapdir's parent %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
}
} else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
@@ -1327,7 +1340,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
inode = get_nonsnap_parent(parent);
- dout("%s using nonsnap parent %p\n", __func__, inode);
+ doutc(cl, "using nonsnap parent %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
} else {
/* dentry target */
inode = d_inode(req->r_dentry);
@@ -1343,10 +1357,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_unlock();
}
- dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
- hash, mode);
if (!inode)
goto random;
+
+ doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode,
+ ceph_vinop(inode), (int)is_hash, hash, mode);
ci = ceph_inode(inode);
if (is_hash && S_ISDIR(inode->i_mode)) {
@@ -1362,9 +1377,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
get_random_bytes(&r, 1);
r %= frag.ndist;
mds = frag.dist[r];
- dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
- __func__, inode, ceph_vinop(inode),
- frag.frag, mds, (int)r, frag.ndist);
+ doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n",
+ inode, ceph_vinop(inode), frag.frag,
+ mds, (int)r, frag.ndist);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE &&
!ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
@@ -1377,9 +1392,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (frag.mds >= 0) {
/* choose auth mds */
mds = frag.mds;
- dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
- __func__, inode, ceph_vinop(inode),
- frag.frag, mds);
+ doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n",
+ inode, ceph_vinop(inode), frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE) {
if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
@@ -1403,9 +1417,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
goto random;
}
mds = cap->session->s_mds;
- dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
- inode, ceph_vinop(inode), mds,
- cap == ci->i_auth_cap ? "auth " : "", cap);
+ doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode,
+ ceph_vinop(inode), mds,
+ cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
out:
iput(inode);
@@ -1416,7 +1430,7 @@ random:
*random = true;
mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
- dout("%s chose random mds%d\n", __func__, mds);
+ doutc(cl, "chose random mds%d\n", mds);
return mds;
}
@@ -1529,6 +1543,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options;
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
+ struct ceph_client *cl = mdsc->fsc->client;
size_t size, count;
void *p, *end;
int ret;
@@ -1567,7 +1582,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
- pr_err("ENOMEM creating session open msg\n");
+ pr_err_client(cl, "ENOMEM creating session open msg\n");
return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
@@ -1607,14 +1622,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
ret = encode_supported_features(&p, end);
if (ret) {
- pr_err("encode_supported_features failed!\n");
+ pr_err_client(cl, "encode_supported_features failed!\n");
ceph_msg_put(msg);
return ERR_PTR(ret);
}
ret = encode_metric_spec(&p, end);
if (ret) {
- pr_err("encode_metric_spec failed!\n");
+ pr_err_client(cl, "encode_metric_spec failed!\n");
ceph_msg_put(msg);
return ERR_PTR(ret);
}
@@ -1642,8 +1657,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
/* wait for mds to go active? */
mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
- dout("open_session to mds%d (%s)\n", mds,
- ceph_mds_state_name(mstate));
+ doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
session->s_state = CEPH_MDS_SESSION_OPENING;
session->s_renew_requested = jiffies;
@@ -1686,8 +1701,9 @@ struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
{
struct ceph_mds_session *session;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("open_export_target_session to mds%d\n", target);
+ doutc(cl, "to mds%d\n", target);
mutex_lock(&mdsc->mutex);
session = __open_export_target_session(mdsc, target);
@@ -1702,13 +1718,14 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_info *mi;
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
+ struct ceph_client *cl = mdsc->fsc->client;
if (mds >= mdsc->mdsmap->possible_max_rank)
return;
mi = &mdsc->mdsmap->m_info[mds];
- dout("open_export_target_sessions for mds%d (%d targets)\n",
- session->s_mds, mi->num_export_targets);
+ doutc(cl, "for mds%d (%d targets)\n", session->s_mds,
+ mi->num_export_targets);
for (i = 0; i < mi->num_export_targets; i++) {
ts = __open_export_target_session(mdsc, mi->export_targets[i]);
@@ -1731,11 +1748,13 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
static void detach_cap_releases(struct ceph_mds_session *session,
struct list_head *target)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
+
lockdep_assert_held(&session->s_cap_lock);
list_splice_init(&session->s_cap_releases, target);
session->s_num_cap_releases = 0;
- dout("dispose_cap_releases mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
}
static void dispose_cap_releases(struct ceph_mds_client *mdsc,
@@ -1753,16 +1772,17 @@ static void dispose_cap_releases(struct ceph_mds_client *mdsc,
static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct rb_node *p;
- dout("cleanup_session_requests mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
while (!list_empty(&session->s_unsafe)) {
req = list_first_entry(&session->s_unsafe,
struct ceph_mds_request, r_unsafe_item);
- pr_warn_ratelimited(" dropping unsafe request %llu\n",
- req->r_tid);
+ pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n",
+ req->r_tid);
if (req->r_target_inode)
mapping_set_error(req->r_target_inode->i_mapping, -EIO);
if (req->r_unsafe_dir)
@@ -1791,13 +1811,14 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *, int mds, void *),
void *arg)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct list_head *p;
struct ceph_cap *cap;
struct inode *inode, *last_inode = NULL;
struct ceph_cap *old_cap = NULL;
int ret;
- dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+ doutc(cl, "%p mds%d\n", session, session->s_mds);
spin_lock(&session->s_cap_lock);
p = session->s_caps.next;
while (p != &session->s_caps) {
@@ -1828,8 +1849,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
spin_lock(&session->s_cap_lock);
p = p->next;
if (!cap->ci) {
- dout("iterate_session_caps finishing cap %p removal\n",
- cap);
+ doutc(cl, "finishing cap %p removal\n", cap);
BUG_ON(cap->session != session);
cap->session = NULL;
list_del_init(&cap->session_caps);
@@ -1858,6 +1878,7 @@ out:
static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
bool invalidate = false;
struct ceph_cap *cap;
int iputs = 0;
@@ -1865,8 +1886,8 @@ static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (cap) {
- dout(" removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->netfs.inode);
+ doutc(cl, " removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->netfs.inode);
iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
}
@@ -1890,7 +1911,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
struct super_block *sb = fsc->sb;
LIST_HEAD(dispose);
- dout("remove_session_caps on %p\n", session);
+ doutc(fsc->client, "on %p\n", session);
ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
wake_up_all(&fsc->mdsc->cap_flushing_wq);
@@ -1971,7 +1992,9 @@ static int wake_up_session_cb(struct inode *inode, int mds, void *arg)
static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
{
- dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
+
+ doutc(cl, "session %p mds%d\n", session, session->s_mds);
ceph_iterate_session_caps(session, wake_up_session_cb,
(void *)(unsigned long)ev);
}
@@ -1985,25 +2008,26 @@ static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
static int send_renew_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
int state;
if (time_after_eq(jiffies, session->s_cap_ttl) &&
time_after_eq(session->s_cap_ttl, session->s_renew_requested))
- pr_info("mds%d caps stale\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps stale\n", session->s_mds);
session->s_renew_requested = jiffies;
/* do not try to renew caps until a recovering mds has reconnected
* with its clients. */
state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
if (state < CEPH_MDS_STATE_RECONNECT) {
- dout("send_renew_caps ignoring mds%d (%s)\n",
- session->s_mds, ceph_mds_state_name(state));
+ doutc(cl, "ignoring mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
return 0;
}
- dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
- ceph_mds_state_name(state));
+ doutc(cl, "to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq);
if (!msg)
@@ -2015,10 +2039,11 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session, u64 seq)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
- dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
- session->s_mds, ceph_session_state_name(session->s_state), seq);
+ doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds,
+ ceph_session_state_name(session->s_state), seq);
msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
@@ -2035,6 +2060,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
static void renewed_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session, int is_renew)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int was_stale;
int wake = 0;
@@ -2046,15 +2072,17 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
if (was_stale) {
if (time_before(jiffies, session->s_cap_ttl)) {
- pr_info("mds%d caps renewed\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps renewed\n",
+ session->s_mds);
wake = 1;
} else {
- pr_info("mds%d caps still stale\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps still stale\n",
+ session->s_mds);
}
}
- dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
- session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
- time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+ doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds,
+ session->s_cap_ttl, was_stale ? "stale" : "fresh",
+ time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
spin_unlock(&session->s_cap_lock);
if (wake)
@@ -2066,11 +2094,11 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
*/
static int request_close_session(struct ceph_mds_session *session)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_msg *msg;
- dout("request_close_session mds%d state %s seq %lld\n",
- session->s_mds, ceph_session_state_name(session->s_state),
- session->s_seq);
+ doutc(cl, "mds%d state %s seq %lld\n", session->s_mds,
+ ceph_session_state_name(session->s_state), session->s_seq);
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
session->s_seq);
if (!msg)
@@ -2126,6 +2154,8 @@ out:
*/
static int trim_caps_cb(struct inode *inode, int mds, void *arg)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
int *remaining = arg;
struct ceph_inode_info *ci = ceph_inode(inode);
int used, wanted, oissued, mine;
@@ -2145,9 +2175,10 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
wanted = __ceph_caps_file_wanted(ci);
oissued = __ceph_caps_issued_other(ci, cap);
- dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
- inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
- ceph_cap_string(used), ceph_cap_string(wanted));
+ doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(mine),
+ ceph_cap_string(oissued), ceph_cap_string(used),
+ ceph_cap_string(wanted));
if (cap == ci->i_auth_cap) {
if (ci->i_dirty_caps || ci->i_flushing_caps ||
!list_empty(&ci->i_cap_snaps))
@@ -2173,7 +2204,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
if (oissued) {
/* we aren't the only cap.. just remove us */
- ceph_remove_cap(cap, true);
+ ceph_remove_cap(mdsc, cap, true);
(*remaining)--;
} else {
struct dentry *dentry;
@@ -2187,8 +2218,8 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
count = atomic_read(&inode->i_count);
if (count == 1)
(*remaining)--;
- dout("trim_caps_cb %p cap %p pruned, count now %d\n",
- inode, cap, count);
+ doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
+ inode, ceph_vinop(inode), cap, count);
} else {
dput(dentry);
}
@@ -2207,17 +2238,18 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
int max_caps)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int trim_caps = session->s_nr_caps - max_caps;
- dout("trim_caps mds%d start: %d / %d, trim %d\n",
- session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+ doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds,
+ session->s_nr_caps, max_caps, trim_caps);
if (trim_caps > 0) {
int remaining = trim_caps;
ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
- dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
- session->s_mds, session->s_nr_caps, max_caps,
- trim_caps - remaining);
+ doutc(cl, "mds%d done: %d / %d, trimmed %d\n",
+ session->s_mds, session->s_nr_caps, max_caps,
+ trim_caps - remaining);
}
ceph_flush_cap_releases(mdsc, session);
@@ -2227,6 +2259,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int ret = 1;
spin_lock(&mdsc->cap_dirty_lock);
@@ -2235,8 +2268,8 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
list_first_entry(&mdsc->cap_flush_list,
struct ceph_cap_flush, g_list);
if (cf->tid <= want_flush_tid) {
- dout("check_caps_flush still flushing tid "
- "%llu <= %llu\n", cf->tid, want_flush_tid);
+ doutc(cl, "still flushing tid %llu <= %llu\n",
+ cf->tid, want_flush_tid);
ret = 0;
}
}
@@ -2252,12 +2285,14 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
static void wait_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
- dout("check_caps_flush want %llu\n", want_flush_tid);
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "want %llu\n", want_flush_tid);
wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid));
- dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
+ doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
}
/*
@@ -2266,6 +2301,7 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
@@ -2324,7 +2360,7 @@ again:
msg->front.iov_len += sizeof(*cap_barrier);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ doutc(cl, "mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
msg = NULL;
}
@@ -2344,13 +2380,13 @@ again:
msg->front.iov_len += sizeof(*cap_barrier);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ doutc(cl, "mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
}
return;
out_err:
- pr_err("send_cap_releases mds%d, failed to allocate message\n",
- session->s_mds);
+ pr_err_client(cl, "mds%d, failed to allocate message\n",
+ session->s_mds);
spin_lock(&session->s_cap_lock);
list_splice(&tmp_list, &session->s_cap_releases);
session->s_num_cap_releases += num_cap_releases;
@@ -2373,16 +2409,17 @@ static void ceph_cap_release_work(struct work_struct *work)
void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
if (mdsc->stopping)
return;
ceph_get_mds_session(session);
if (queue_work(mdsc->fsc->cap_wq,
&session->s_cap_release_work)) {
- dout("cap release work queued\n");
+ doutc(cl, "cap release work queued\n");
} else {
ceph_put_mds_session(session);
- dout("failed to queue cap release work\n");
+ doutc(cl, "failed to queue cap release work\n");
}
}
@@ -2410,13 +2447,14 @@ static void ceph_cap_reclaim_work(struct work_struct *work)
void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
if (mdsc->stopping)
return;
if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
- dout("caps reclaim work queued\n");
+ doutc(cl, "caps reclaim work queued\n");
} else {
- dout("failed to queue caps release work\n");
+ doutc(cl, "failed to queue caps release work\n");
}
}
@@ -2588,6 +2626,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
/**
* ceph_mdsc_build_path - build a path string to a given dentry
+ * @mdsc: mds client
* @dentry: dentry to which path should be built
* @plen: returned length of string
* @pbase: returned base inode number
@@ -2607,9 +2646,10 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* Encode hidden .snap dirs as a double /, i.e.
* foo/.snap/bar -> foo//bar
*/
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
- int for_wire)
+char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+ int *plen, u64 *pbase, int for_wire)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
struct inode *inode;
char *path;
@@ -2635,8 +2675,7 @@ retry:
spin_lock(&cur->d_lock);
inode = d_inode(cur);
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("build_path path+%d: %p SNAPDIR\n",
- pos, cur);
+ doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur);
spin_unlock(&cur->d_lock);
parent = dget_parent(cur);
} else if (for_wire && inode && dentry != cur &&
@@ -2714,21 +2753,21 @@ retry:
* A rename didn't occur, but somehow we didn't end up where
* we thought we would. Throw a warning and try again.
*/
- pr_warn("build_path did not end path lookup where expected (pos = %d)\n",
- pos);
+ pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n",
+ pos);
goto retry;
}
*pbase = base;
*plen = PATH_MAX - 1 - pos;
- dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, d_count(dentry), base, *plen, path + pos);
+ doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
+ base, *plen, path + pos);
return path + pos;
}
-static int build_dentry_path(struct dentry *dentry, struct inode *dir,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath, bool parent_locked)
+static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+ struct inode *dir, const char **ppath, int *ppathlen,
+ u64 *pino, bool *pfreepath, bool parent_locked)
{
char *path;
@@ -2744,7 +2783,7 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
if (IS_ERR(path))
return PTR_ERR(path);
*ppath = path;
@@ -2756,6 +2795,7 @@ static int build_inode_path(struct inode *inode,
const char **ppath, int *ppathlen, u64 *pino,
bool *pfreepath)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
@@ -2765,7 +2805,7 @@ static int build_inode_path(struct inode *inode,
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -2778,27 +2818,28 @@ static int build_inode_path(struct inode *inode,
* request arguments may be specified via an inode *, a dentry *, or
* an explicit ino+path.
*/
-static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
- struct inode *rdiri, const char *rpath,
- u64 rino, const char **ppath, int *pathlen,
- u64 *ino, bool *freepath, bool parent_locked)
+static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
+ struct dentry *rdentry, struct inode *rdiri,
+ const char *rpath, u64 rino, const char **ppath,
+ int *pathlen, u64 *ino, bool *freepath,
+ bool parent_locked)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
if (rinode) {
r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
- dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
- ceph_snap(rinode));
+ doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+ ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+ r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
freepath, parent_locked);
- dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
- *ppath);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
} else if (rpath || rino) {
*ino = rino;
*ppath = rpath;
*pathlen = rpath ? strlen(rpath) : 0;
- dout(" path %.*s\n", *pathlen, rpath);
+ doutc(cl, " path %.*s\n", *pathlen, rpath);
}
return r;
@@ -2840,6 +2881,17 @@ static void encode_mclientrequest_tail(void **p,
}
}
+static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
+{
+ if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
+ return 1;
+
+ if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
+ return 2;
+
+ return CEPH_MDS_REQUEST_HEAD_VERSION;
+}
+
static struct ceph_mds_request_head_legacy *
find_legacy_request_head(void *p, u64 features)
{
@@ -2861,6 +2913,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
{
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
const char *path1 = NULL;
@@ -2874,10 +2927,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
void *p, *end;
int ret;
bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
- bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
- &session->s_features);
+ u16 request_head_version = mds_supported_head_version(session);
+ kuid_t caller_fsuid = req->r_cred->fsuid;
+ kgid_t caller_fsgid = req->r_cred->fsgid;
- ret = set_request_path_attr(req->r_inode, req->r_dentry,
+ ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
req->r_parent, req->r_path1, req->r_ino1.ino,
&path1, &pathlen1, &ino1, &freepath1,
test_bit(CEPH_MDS_R_PARENT_LOCKED,
@@ -2891,7 +2945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
- ret = set_request_path_attr(NULL, old_dentry,
+ ret = set_request_path_attr(mdsc, NULL, old_dentry,
req->r_old_dentry_dir,
req->r_path2, req->r_ino2.ino,
&path2, &pathlen2, &ino2, &freepath2, true);
@@ -2916,8 +2970,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
*/
if (legacy)
len = sizeof(struct ceph_mds_request_head_legacy);
- else if (old_version)
+ else if (request_head_version == 1)
len = sizeof(struct ceph_mds_request_head_old);
+ else if (request_head_version == 2)
+ len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
else
len = sizeof(struct ceph_mds_request_head);
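Taken together, the preceding hunks make the encoded request-head layout a function of the peer's capabilities: the legacy head when the connection lacks CEPH_FEATURE_FS_BTIME, the old fixed-size head for version 1, a head truncated after ext_num_fwd for version 2, and the full structure otherwise. The sketch below simply restates that negotiation in one place; it is illustrative only and not part of the patch (the names are the ones already used in the hunks above, the wrapper function itself is hypothetical):

/*
 * Illustrative only: consolidated view of the head-size selection
 * that create_request_message() performs after this change.
 */
static size_t mds_request_head_len(struct ceph_mds_session *session, bool legacy)
{
	u16 v = mds_supported_head_version(session);

	if (legacy)	/* peer lacks CEPH_FEATURE_FS_BTIME */
		return sizeof(struct ceph_mds_request_head_legacy);
	if (v == 1)	/* peer lacks CEPHFS_FEATURE_32BITS_RETRY_FWD */
		return sizeof(struct ceph_mds_request_head_old);
	if (v == 2)	/* peer lacks CEPHFS_FEATURE_HAS_OWNER_UIDGID */
		return offsetofend(struct ceph_mds_request_head, ext_num_fwd);
	return sizeof(struct ceph_mds_request_head);
}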
@@ -2967,6 +3023,30 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead = find_legacy_request_head(msg->front.iov_base,
session->s_con.peer_features);
+ if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
+ !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
+ WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op));
+
+ if (enable_unsafe_idmap) {
+ pr_warn_once_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. UID/GID-based restrictions may"
+ " not work properly.\n");
+
+ caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ } else {
+ pr_err_ratelimited_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. Fail request with -EIO.\n");
+
+ ret = -EIO;
+ goto out_err;
+ }
+ }
+
/*
* The ceph_mds_request_head_legacy didn't contain a version field, and
* one was added when we moved the message version from 3->4.
@@ -2974,17 +3054,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (legacy) {
msg->hdr.version = cpu_to_le16(3);
p = msg->front.iov_base + sizeof(*lhead);
- } else if (old_version) {
+ } else if (request_head_version == 1) {
struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
msg->hdr.version = cpu_to_le16(4);
ohead->version = cpu_to_le16(1);
p = msg->front.iov_base + sizeof(*ohead);
+ } else if (request_head_version == 2) {
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
+
+ msg->hdr.version = cpu_to_le16(6);
+ nhead->version = cpu_to_le16(2);
+
+ p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
} else {
struct ceph_mds_request_head *nhead = msg->front.iov_base;
+ kuid_t owner_fsuid;
+ kgid_t owner_fsgid;
msg->hdr.version = cpu_to_le16(6);
nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+ nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head));
+
+ if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) {
+ owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
+ nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
+ } else {
+ nhead->owner_uid = cpu_to_le32(-1);
+ nhead->owner_gid = cpu_to_le32(-1);
+ }
+
p = msg->front.iov_base + sizeof(*nhead);
}
@@ -2993,9 +3096,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
lhead->op = cpu_to_le32(req->r_op);
lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
- req->r_cred->fsuid));
+ caller_fsuid));
lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
- req->r_cred->fsgid));
+ caller_fsgid));
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
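The idmapped-mount handling introduced in the preceding hunks follows a single pattern: a kernel-internal kuid_t/kgid_t is first translated through the mount's idmapping and only then flattened to the numeric id that is encoded for the wire in init_user_ns. A minimal sketch of that translation, using only the helpers the patch itself calls (from_vfsuid(), VFSUIDT_INIT(), from_kuid()); the wrapper function is hypothetical and illustrative, not part of the patch:

/* Hypothetical helper: map a caller uid through an idmapped mount for wire encoding. */
static u32 wire_uid(struct mnt_idmap *idmap, kuid_t fsuid)
{
	kuid_t mapped = from_vfsuid(idmap, &init_user_ns, VFSUIDT_INIT(fsuid));

	return from_kuid(&init_user_ns, mapped);	/* value stored in the request head */
}

With req->r_mnt_idmap == &nop_mnt_idmap this is an identity mapping; otherwise the mapped value ends up either in the legacy caller_uid/caller_gid fields (only when the MDS lacks CEPHFS_FEATURE_HAS_OWNER_UIDGID and enable_unsafe_idmap allows it) or in the new owner_uid/owner_gid fields of the extended request head for inode-creating operations.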
@@ -3099,6 +3202,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
{
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request_head_legacy *lhead;
struct ceph_mds_request_head *nhead;
struct ceph_msg *msg;
@@ -3117,8 +3221,8 @@ static int __prepare_send_request(struct ceph_mds_session *session,
old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
if ((old_version && req->r_attempts >= old_max_retry) ||
((uint32_t)req->r_attempts >= U32_MAX)) {
- pr_warn_ratelimited("%s request tid %llu seq overflow\n",
- __func__, req->r_tid);
+ pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
+ req->r_tid);
return -EMULTIHOP;
}
}
@@ -3133,8 +3237,8 @@ static int __prepare_send_request(struct ceph_mds_session *session,
else
req->r_sent_on_mseq = -1;
}
- dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
- req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+ doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid,
+ ceph_mds_op_name(req->r_op), req->r_attempts);
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
void *p;
@@ -3202,7 +3306,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
}
- dout(" r_parent = %p\n", req->r_parent);
+ doutc(cl, " r_parent = %p\n", req->r_parent);
return 0;
}
@@ -3230,6 +3334,7 @@ static int __send_request(struct ceph_mds_session *session,
static void __do_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *session = NULL;
int mds = -1;
int err = 0;
@@ -3242,29 +3347,29 @@ static void __do_request(struct ceph_mds_client *mdsc,
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
- dout("do_request metadata corrupted\n");
+ doutc(cl, "metadata corrupted\n");
err = -EIO;
goto finish;
}
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
- dout("do_request timed out\n");
+ doutc(cl, "timed out\n");
err = -ETIMEDOUT;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- dout("do_request forced umount\n");
+ doutc(cl, "forced umount\n");
err = -EIO;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
if (mdsc->mdsmap_err) {
err = mdsc->mdsmap_err;
- dout("do_request mdsmap err %d\n", err);
+ doutc(cl, "mdsmap err %d\n", err);
goto finish;
}
if (mdsc->mdsmap->m_epoch == 0) {
- dout("do_request no mdsmap, waiting for map\n");
+ doutc(cl, "no mdsmap, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
}
@@ -3285,7 +3390,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
err = -EJUKEBOX;
goto finish;
}
- dout("do_request no mds or not active, waiting for map\n");
+ doutc(cl, "no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
}
@@ -3301,8 +3406,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
}
req->r_session = ceph_get_mds_session(session);
- dout("do_request mds%d session %p state %s\n", mds, session,
- ceph_session_state_name(session->s_state));
+ doutc(cl, "mds%d session %p state %s\n", mds, session,
+ ceph_session_state_name(session->s_state));
/*
* The old ceph will crash the MDSs when see unknown OPs
@@ -3393,8 +3498,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
- dout("do_request session changed for auth cap %d -> %d\n",
- cap->session->s_mds, session->s_mds);
+ doutc(cl, "session changed for auth cap %d -> %d\n",
+ cap->session->s_mds, session->s_mds);
/* Remove the auth cap from old session */
spin_lock(&cap->session->s_cap_lock);
@@ -3421,7 +3526,7 @@ out_session:
ceph_put_mds_session(session);
finish:
if (err) {
- dout("__do_request early error %d\n", err);
+ doutc(cl, "early error %d\n", err);
req->r_err = err;
complete_request(mdsc, req);
__unregister_request(mdsc, req);
@@ -3435,6 +3540,7 @@ finish:
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
LIST_HEAD(tmp_list);
@@ -3444,7 +3550,8 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
req = list_entry(tmp_list.next,
struct ceph_mds_request, r_wait);
list_del_init(&req->r_wait);
- dout(" wake request %p tid %llu\n", req, req->r_tid);
+ doutc(cl, " wake request %p tid %llu\n", req,
+ req->r_tid);
__do_request(mdsc, req);
}
}
@@ -3455,10 +3562,11 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
*/
static void kick_requests(struct ceph_mds_client *mdsc, int mds)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct rb_node *p = rb_first(&mdsc->request_tree);
- dout("kick_requests mds%d\n", mds);
+ doutc(cl, "kick_requests mds%d\n", mds);
while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node);
p = rb_next(p);
@@ -3468,7 +3576,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
continue; /* only new requests */
if (req->r_session &&
req->r_session->s_mds == mds) {
- dout(" kicking tid %llu\n", req->r_tid);
+ doutc(cl, " kicking tid %llu\n", req->r_tid);
list_del_init(&req->r_wait);
__do_request(mdsc, req);
}
@@ -3478,6 +3586,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err = 0;
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
@@ -3499,8 +3608,7 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
if (req->r_inode) {
err = ceph_wait_on_async_create(req->r_inode);
if (err) {
- dout("%s: wait for async create returned: %d\n",
- __func__, err);
+ doutc(cl, "wait for async create returned: %d\n", err);
return err;
}
}
@@ -3508,13 +3616,12 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
if (!err && req->r_old_inode) {
err = ceph_wait_on_async_create(req->r_old_inode);
if (err) {
- dout("%s: wait for async create returned: %d\n",
- __func__, err);
+ doutc(cl, "wait for async create returned: %d\n", err);
return err;
}
}
- dout("submit_request on %p for inode %p\n", req, dir);
+ doutc(cl, "submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
__do_request(mdsc, req);
@@ -3527,10 +3634,11 @@ int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
ceph_mds_request_wait_callback_t wait_func)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err;
/* wait */
- dout("do_request waiting\n");
+ doutc(cl, "do_request waiting\n");
if (wait_func) {
err = wait_func(mdsc, req);
} else {
@@ -3544,14 +3652,14 @@ int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
else
err = timeleft; /* killed */
}
- dout("do_request waited, got %d\n", err);
+ doutc(cl, "do_request waited, got %d\n", err);
mutex_lock(&mdsc->mutex);
/* only abort if we didn't race with a real reply */
if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
err = le32_to_cpu(req->r_reply_info.head->result);
} else if (err < 0) {
- dout("aborted request %lld with %d\n", req->r_tid, err);
+ doutc(cl, "aborted request %lld with %d\n", req->r_tid, err);
/*
* ensure we aren't running concurrently with
@@ -3582,15 +3690,16 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err;
- dout("do_request on %p\n", req);
+ doutc(cl, "do_request on %p\n", req);
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
err = ceph_mdsc_wait_request(mdsc, req, NULL);
- dout("do_request %p done, result %d\n", req, err);
+ doutc(cl, "do_request %p done, result %d\n", req, err);
return err;
}
@@ -3602,8 +3711,10 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
struct inode *dir = req->r_parent;
struct inode *old_dir = req->r_old_dentry_dir;
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
- dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
+ doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n",
+ dir, old_dir);
ceph_dir_clear_complete(dir);
if (old_dir)
@@ -3624,6 +3735,7 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_mds_reply_head *head = msg->front.iov_base;
struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
@@ -3634,7 +3746,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
bool close_sessions = false;
if (msg->front.iov_len < sizeof(*head)) {
- pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+ pr_err_client(cl, "got corrupt (short) reply\n");
ceph_msg_dump(msg);
return;
}
@@ -3644,17 +3756,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&mdsc->mutex);
req = lookup_get_request(mdsc, tid);
if (!req) {
- dout("handle_reply on unknown tid %llu\n", tid);
+ doutc(cl, "on unknown tid %llu\n", tid);
mutex_unlock(&mdsc->mutex);
return;
}
- dout("handle_reply %p\n", req);
+ doutc(cl, "handle_reply %p\n", req);
/* correct session? */
if (req->r_session != session) {
- pr_err("mdsc_handle_reply got %llu on session mds%d"
- " not mds%d\n", tid, session->s_mds,
- req->r_session ? req->r_session->s_mds : -1);
+ pr_err_client(cl, "got %llu on session mds%d not mds%d\n",
+ tid, session->s_mds,
+ req->r_session ? req->r_session->s_mds : -1);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -3662,14 +3774,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* dup? */
if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
(test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
- pr_warn("got a dup %s reply on %llu from mds%d\n",
- head->safe ? "safe" : "unsafe", tid, mds);
+ pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n",
+ head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
- pr_warn("got unsafe after safe on %llu from mds%d\n",
- tid, mds);
+ pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n",
+ tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -3692,7 +3804,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
* response. And even if it did, there is nothing
* useful we could do with a revised return value.
*/
- dout("got safe reply %llu, mds%d\n", tid, mds);
+ doutc(cl, "got safe reply %llu, mds%d\n", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -3702,7 +3814,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
}
- dout("handle_reply tid %lld result %d\n", tid, result);
+ doutc(cl, "tid %lld result %d\n", tid, result);
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
err = parse_reply_info(session, msg, req, (u64)-1);
else
@@ -3742,7 +3854,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&session->s_mutex);
if (err < 0) {
- pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
+ pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",
+ mds, tid);
ceph_msg_dump(msg);
goto out_err;
}
@@ -3806,7 +3919,7 @@ out_err:
set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
}
} else {
- dout("reply arrived after request %lld was aborted\n", tid);
+ doutc(cl, "reply arrived after request %lld was aborted\n", tid);
}
mutex_unlock(&mdsc->mutex);
@@ -3835,6 +3948,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
u64 tid = le64_to_cpu(msg->hdr.tid);
u32 next_mds;
@@ -3852,12 +3966,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
req = lookup_get_request(mdsc, tid);
if (!req) {
mutex_unlock(&mdsc->mutex);
- dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
+ doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds);
return; /* dup reply? */
}
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
- dout("forward tid %llu aborted, unregistering\n", tid);
+ doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
/*
@@ -3873,10 +3987,11 @@ static void handle_forward(struct ceph_mds_client *mdsc,
set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
mutex_unlock(&req->r_fill_mutex);
aborted = true;
- pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
+ pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n",
+ tid);
} else {
/* resend. forward race not possible; mds would drop */
- dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+ doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err);
BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
req->r_attempts = 0;
@@ -3894,7 +4009,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
return;
bad:
- pr_err("mdsc_handle_forward decode error err=%d\n", err);
+ pr_err_client(cl, "decode error err=%d\n", err);
ceph_msg_dump(msg);
}
@@ -3933,6 +4048,7 @@ static void handle_session(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int mds = session->s_mds;
int msg_version = le16_to_cpu(msg->hdr.version);
void *p = msg->front.iov_base;
@@ -3980,7 +4096,8 @@ static void handle_session(struct ceph_mds_session *session,
/* version >= 5, flags */
ceph_decode_32_safe(&p, end, flags, bad);
if (flags & CEPH_SESSION_BLOCKLISTED) {
- pr_warn("mds%d session blocklisted\n", session->s_mds);
+ pr_warn_client(cl, "mds%d session blocklisted\n",
+ session->s_mds);
blocklisted = true;
}
}
@@ -3996,22 +4113,24 @@ static void handle_session(struct ceph_mds_session *session,
mutex_lock(&session->s_mutex);
- dout("handle_session mds%d %s %p state %s seq %llu\n",
- mds, ceph_session_op_name(op), session,
- ceph_session_state_name(session->s_state), seq);
+ doutc(cl, "mds%d %s %p state %s seq %llu\n", mds,
+ ceph_session_op_name(op), session,
+ ceph_session_state_name(session->s_state), seq);
if (session->s_state == CEPH_MDS_SESSION_HUNG) {
session->s_state = CEPH_MDS_SESSION_OPEN;
- pr_info("mds%d came back\n", session->s_mds);
+ pr_info_client(cl, "mds%d came back\n", session->s_mds);
}
switch (op) {
case CEPH_SESSION_OPEN:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
- pr_info("mds%d reconnect success\n", session->s_mds);
+ pr_info_client(cl, "mds%d reconnect success\n",
+ session->s_mds);
if (session->s_state == CEPH_MDS_SESSION_OPEN) {
- pr_notice("mds%d is already opened\n", session->s_mds);
+ pr_notice_client(cl, "mds%d is already opened\n",
+ session->s_mds);
} else {
session->s_state = CEPH_MDS_SESSION_OPEN;
session->s_features = features;
@@ -4041,7 +4160,8 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_CLOSE:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
- pr_info("mds%d reconnect denied\n", session->s_mds);
+ pr_info_client(cl, "mds%d reconnect denied\n",
+ session->s_mds);
session->s_state = CEPH_MDS_SESSION_CLOSED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
@@ -4050,8 +4170,8 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_STALE:
- pr_info("mds%d caps went stale, renewing\n",
- session->s_mds);
+ pr_info_client(cl, "mds%d caps went stale, renewing\n",
+ session->s_mds);
atomic_inc(&session->s_cap_gen);
session->s_cap_ttl = jiffies - 1;
send_renew_caps(mdsc, session);
@@ -4072,7 +4192,7 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_FORCE_RO:
- dout("force_session_readonly %p\n", session);
+ doutc(cl, "force_session_readonly %p\n", session);
spin_lock(&session->s_cap_lock);
session->s_readonly = true;
spin_unlock(&session->s_cap_lock);
@@ -4081,7 +4201,8 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_REJECT:
WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
- pr_info("mds%d rejected session\n", session->s_mds);
+ pr_info_client(cl, "mds%d rejected session\n",
+ session->s_mds);
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
@@ -4091,7 +4212,7 @@ static void handle_session(struct ceph_mds_session *session,
break;
default:
- pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+ pr_err_client(cl, "bad op %d mds%d\n", op, mds);
WARN_ON(1);
}
@@ -4108,30 +4229,32 @@ static void handle_session(struct ceph_mds_session *session,
return;
bad:
- pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
- (int)msg->front.iov_len);
+ pr_err_client(cl, "corrupt message mds%d len %d\n", mds,
+ (int)msg->front.iov_len);
ceph_msg_dump(msg);
return;
}
void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
int dcaps;
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
- dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
}
}
void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
int dcaps;
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
- dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
dcaps);
}
@@ -4146,7 +4269,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req, *nreq;
struct rb_node *p;
- dout("replay_unsafe_requests mds%d\n", session->s_mds);
+ doutc(mdsc->fsc->client, "mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
@@ -4290,6 +4413,8 @@ out_unlock:
*/
static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
union {
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
@@ -4307,7 +4432,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+ path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
@@ -4326,9 +4451,9 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
err = 0;
goto out_err;
}
- dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
- inode, ceph_vinop(inode), cap, cap->cap_id,
- ceph_cap_string(cap->issued));
+ doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode,
+ ceph_vinop(inode), cap, cap->cap_id,
+ ceph_cap_string(cap->issued));
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@@ -4353,12 +4478,16 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
+ struct timespec64 ts;
+
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v1.issued = cpu_to_le32(cap->issued);
rec.v1.size = cpu_to_le64(i_size_read(inode));
- ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
- ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
+ ts = inode_get_mtime(inode);
+ ceph_encode_timespec64(&rec.v1.mtime, &ts);
+ ts = inode_get_atime(inode);
+ ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v1.pathbase = cpu_to_le64(pathbase);
}
@@ -4478,6 +4607,7 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc,
{
struct rb_node *p;
struct ceph_pagelist *pagelist = recon_state->pagelist;
+ struct ceph_client *cl = mdsc->fsc->client;
int err = 0;
if (recon_state->msg_version >= 4) {
@@ -4516,8 +4646,8 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc,
ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
}
- dout(" adding snap realm %llx seq %lld parent %llx\n",
- realm->ino, realm->seq, realm->parent_ino);
+ doutc(cl, " adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
sr_rec.ino = cpu_to_le64(realm->ino);
sr_rec.seq = cpu_to_le64(realm->seq);
sr_rec.parent = cpu_to_le64(realm->parent_ino);
@@ -4546,6 +4676,7 @@ fail:
static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *reply;
int mds = session->s_mds;
int err = -ENOMEM;
@@ -4554,7 +4685,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
};
LIST_HEAD(dispose);
- pr_info("mds%d reconnect start\n", mds);
+ pr_info_client(cl, "mds%d reconnect start\n", mds);
recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!recon_state.pagelist)
@@ -4570,8 +4701,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
- dout("session %p state %s\n", session,
- ceph_session_state_name(session->s_state));
+ doutc(cl, "session %p state %s\n", session,
+ ceph_session_state_name(session->s_state));
atomic_inc(&session->s_cap_gen);
@@ -4705,7 +4836,8 @@ fail:
fail_nomsg:
ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
- pr_err("error %d preparing reconnect for mds%d\n", err, mds);
+ pr_err_client(cl, "error %d preparing reconnect for mds%d\n",
+ err, mds);
return;
}
@@ -4724,9 +4856,9 @@ static void check_new_map(struct ceph_mds_client *mdsc,
int oldstate, newstate;
struct ceph_mds_session *s;
unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("check_new_map new %u old %u\n",
- newmap->m_epoch, oldmap->m_epoch);
+ doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch);
if (newmap->m_info) {
for (i = 0; i < newmap->possible_max_rank; i++) {
@@ -4742,12 +4874,12 @@ static void check_new_map(struct ceph_mds_client *mdsc,
oldstate = ceph_mdsmap_get_state(oldmap, i);
newstate = ceph_mdsmap_get_state(newmap, i);
- dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
- i, ceph_mds_state_name(oldstate),
- ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
- ceph_mds_state_name(newstate),
- ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
- ceph_session_state_name(s->s_state));
+ doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n",
+ i, ceph_mds_state_name(oldstate),
+ ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
+ ceph_mds_state_name(newstate),
+ ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
+ ceph_session_state_name(s->s_state));
if (i >= newmap->possible_max_rank) {
/* force close session for stopped mds */
@@ -4800,7 +4932,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,
newstate >= CEPH_MDS_STATE_ACTIVE) {
if (oldstate != CEPH_MDS_STATE_CREATING &&
oldstate != CEPH_MDS_STATE_STARTING)
- pr_info("mds%d recovery completed\n", s->s_mds);
+ pr_info_client(cl, "mds%d recovery completed\n",
+ s->s_mds);
kick_requests(mdsc, i);
mutex_unlock(&mdsc->mutex);
mutex_lock(&s->s_mutex);
@@ -4844,12 +4977,13 @@ static void check_new_map(struct ceph_mds_client *mdsc,
s = __open_export_target_session(mdsc, i);
if (IS_ERR(s)) {
err = PTR_ERR(s);
- pr_err("failed to open export target session, err %d\n",
- err);
+ pr_err_client(cl,
+ "failed to open export target session, err %d\n",
+ err);
continue;
}
}
- dout("send reconnect to export target mds.%d\n", i);
+ doutc(cl, "send reconnect to export target mds.%d\n", i);
mutex_unlock(&mdsc->mutex);
send_mds_reconnect(mdsc, s);
ceph_put_mds_session(s);
@@ -4865,8 +4999,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
if (s->s_state == CEPH_MDS_SESSION_OPEN ||
s->s_state == CEPH_MDS_SESSION_HUNG ||
s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout(" connecting to export targets of laggy mds%d\n",
- i);
+ doutc(cl, " connecting to export targets of laggy mds%d\n", i);
__open_export_target_sessions(mdsc, s);
}
}
@@ -4893,6 +5026,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
struct dentry *parent, *dentry;
@@ -4904,7 +5038,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct qstr dname;
int release = 0;
- dout("handle_lease from mds%d\n", mds);
+ doutc(cl, "from mds%d\n", mds);
if (!ceph_inc_mds_stopping_blocker(mdsc, session))
return;
@@ -4922,20 +5056,19 @@ static void handle_lease(struct ceph_mds_client *mdsc,
/* lookup inode */
inode = ceph_find_inode(sb, vino);
- dout("handle_lease %s, ino %llx %p %.*s\n",
- ceph_lease_op_name(h->action), vino.ino, inode,
- dname.len, dname.name);
+ doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action),
+ vino.ino, inode, dname.len, dname.name);
mutex_lock(&session->s_mutex);
if (!inode) {
- dout("handle_lease no inode %llx\n", vino.ino);
+ doutc(cl, "no inode %llx\n", vino.ino);
goto release;
}
/* dentry */
parent = d_find_alias(inode);
if (!parent) {
- dout("no parent dentry on inode %p\n", inode);
+ doutc(cl, "no parent dentry on inode %p\n", inode);
WARN_ON(1);
goto release; /* hrm... */
}
@@ -4995,7 +5128,7 @@ out:
bad:
ceph_dec_mds_stopping_blocker(mdsc);
- pr_err("corrupt lease message\n");
+ pr_err_client(cl, "corrupt lease message\n");
ceph_msg_dump(msg);
}
@@ -5003,13 +5136,14 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
struct dentry *dentry, char action,
u32 seq)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_lease *lease;
struct inode *dir;
int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
- dout("lease_send_msg identry %p %s to mds%d\n",
- dentry, ceph_lease_op_name(action), session->s_mds);
+ doutc(cl, "identry %p %s to mds%d\n", dentry, ceph_lease_op_name(action),
+ session->s_mds);
msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
if (!msg)
@@ -5042,6 +5176,7 @@ static void lock_unlock_session(struct ceph_mds_session *s)
static void maybe_recover_session(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_fs_client *fsc = mdsc->fsc;
if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
@@ -5053,17 +5188,19 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
if (!READ_ONCE(fsc->blocklisted))
return;
- pr_info("auto reconnect after blocklisted\n");
+ pr_info_client(cl, "auto reconnect after blocklisted\n");
ceph_force_reconnect(fsc->sb);
}
bool check_session_state(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
s->s_state = CEPH_MDS_SESSION_HUNG;
- pr_info("mds%d hung\n", s->s_mds);
+ pr_info_client(cl, "mds%d hung\n", s->s_mds);
}
break;
case CEPH_MDS_SESSION_CLOSING:
@@ -5083,6 +5220,8 @@ bool check_session_state(struct ceph_mds_session *s)
*/
void inc_session_sequence(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+
lockdep_assert_held(&s->s_mutex);
s->s_seq++;
@@ -5090,11 +5229,11 @@ void inc_session_sequence(struct ceph_mds_session *s)
if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
int ret;
- dout("resending session close request for mds%d\n", s->s_mds);
+ doutc(cl, "resending session close request for mds%d\n", s->s_mds);
ret = request_close_session(s);
if (ret < 0)
- pr_err("unable to close session to mds%d: %d\n",
- s->s_mds, ret);
+ pr_err_client(cl, "unable to close session to mds%d: %d\n",
+ s->s_mds, ret);
}
}
@@ -5123,7 +5262,7 @@ static void delayed_work(struct work_struct *work)
int renew_caps;
int i;
- dout("mdsc delayed_work\n");
+ doutc(mdsc->fsc->client, "mdsc delayed_work\n");
if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
return;
@@ -5252,6 +5391,7 @@ err_mdsc:
*/
static void wait_requests(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_request *req;
@@ -5259,25 +5399,25 @@ static void wait_requests(struct ceph_mds_client *mdsc)
if (__get_oldest_req(mdsc)) {
mutex_unlock(&mdsc->mutex);
- dout("wait_requests waiting for requests\n");
+ doutc(cl, "waiting for requests\n");
wait_for_completion_timeout(&mdsc->safe_umount_waiters,
ceph_timeout_jiffies(opts->mount_timeout));
/* tear down remaining requests */
mutex_lock(&mdsc->mutex);
while ((req = __get_oldest_req(mdsc))) {
- dout("wait_requests timed out on tid %llu\n",
- req->r_tid);
+ doutc(cl, "timed out on tid %llu\n", req->r_tid);
list_del_init(&req->r_wait);
__unregister_request(mdsc, req);
}
}
mutex_unlock(&mdsc->mutex);
- dout("wait_requests done\n");
+ doutc(cl, "done\n");
}
void send_flush_mdlog(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
struct ceph_msg *msg;
/*
@@ -5287,13 +5427,13 @@ void send_flush_mdlog(struct ceph_mds_session *s)
return;
mutex_lock(&s->s_mutex);
- dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
- ceph_session_state_name(s->s_state), s->s_seq);
+ doutc(cl, "request mdlog flush to mds%d (%s)s seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
s->s_seq);
if (!msg) {
- pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
- s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+ pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
} else {
ceph_con_send(&s->s_con, msg);
}
@@ -5306,7 +5446,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
*/
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
- dout("pre_umount\n");
+ doutc(mdsc->fsc->client, "begin\n");
mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
@@ -5321,6 +5461,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
ceph_msgr_flush();
ceph_cleanup_quotarealms_inodes(mdsc);
+ doutc(mdsc->fsc->client, "done\n");
}
/*
@@ -5329,12 +5470,13 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
u64 want_tid)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req = NULL, *nextreq;
struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
mutex_lock(&mdsc->mutex);
- dout("%s want %lld\n", __func__, want_tid);
+ doutc(cl, "want %lld\n", want_tid);
restart:
req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) {
@@ -5368,8 +5510,8 @@ restart:
} else {
ceph_put_mds_session(s);
}
- dout("%s wait on %llu (want %llu)\n", __func__,
- req->r_tid, want_tid);
+ doutc(cl, "wait on %llu (want %llu)\n",
+ req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion);
mutex_lock(&mdsc->mutex);
@@ -5387,17 +5529,18 @@ restart:
}
mutex_unlock(&mdsc->mutex);
ceph_put_mds_session(last_session);
- dout("%s done\n", __func__);
+ doutc(cl, "done\n");
}
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u64 want_tid, want_flush;
if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
return;
- dout("sync\n");
+ doutc(cl, "sync\n");
mutex_lock(&mdsc->mutex);
want_tid = mdsc->last_tid;
mutex_unlock(&mdsc->mutex);
@@ -5413,8 +5556,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
}
spin_unlock(&mdsc->cap_dirty_lock);
- dout("sync want tid %lld flush_seq %lld\n",
- want_tid, want_flush);
+ doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush);
@@ -5436,11 +5578,12 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
struct ceph_options *opts = mdsc->fsc->client->options;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *session;
int i;
int skipped = 0;
- dout("close_sessions\n");
+ doutc(cl, "begin\n");
/* close sessions */
mutex_lock(&mdsc->mutex);
@@ -5458,7 +5601,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
}
mutex_unlock(&mdsc->mutex);
- dout("waiting for sessions to close\n");
+ doutc(cl, "waiting for sessions to close\n");
wait_event_timeout(mdsc->session_close_wq,
done_closing_sessions(mdsc, skipped),
ceph_timeout_jiffies(opts->mount_timeout));
@@ -5486,7 +5629,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
- dout("stopped\n");
+ doutc(cl, "done\n");
}
void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
@@ -5494,7 +5637,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
struct ceph_mds_session *session;
int mds;
- dout("force umount\n");
+ doutc(mdsc->fsc->client, "force umount\n");
mutex_lock(&mdsc->mutex);
for (mds = 0; mds < mdsc->max_sessions; mds++) {
@@ -5525,7 +5668,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
- dout("stop\n");
+ doutc(mdsc->fsc->client, "stop\n");
/*
* Make sure the delayed work stopped before releasing
* the resources.
@@ -5546,7 +5689,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
{
struct ceph_mds_client *mdsc = fsc->mdsc;
- dout("mdsc_destroy %p\n", mdsc);
+ doutc(fsc->client, "%p\n", mdsc);
if (!mdsc)
return;
@@ -5560,12 +5703,13 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
fsc->mdsc = NULL;
kfree(mdsc);
- dout("mdsc_destroy %p done\n", mdsc);
+ doutc(fsc->client, "%p done\n", mdsc);
}
void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
struct ceph_fs_client *fsc = mdsc->fsc;
+ struct ceph_client *cl = fsc->client;
const char *mds_namespace = fsc->mount_options->mds_namespace;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
@@ -5577,7 +5721,7 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
ceph_decode_need(&p, end, sizeof(u32), bad);
epoch = ceph_decode_32(&p);
- dout("handle_fsmap epoch %u\n", epoch);
+ doutc(cl, "epoch %u\n", epoch);
/* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
@@ -5622,7 +5766,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
bad:
- pr_err("error decoding fsmap %d. Shutting down mount.\n", err);
+ pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n",
+ err);
ceph_umount_begin(mdsc->fsc->sb);
ceph_msg_dump(msg);
err_out:
@@ -5637,6 +5782,7 @@ err_out:
*/
void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u32 epoch;
u32 maplen;
void *p = msg->front.iov_base;
@@ -5651,18 +5797,17 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
- dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+ doutc(cl, "epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */
mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
- dout("handle_map epoch %u <= our %u\n",
- epoch, mdsc->mdsmap->m_epoch);
+ doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
return;
}
- newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client));
+ newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client));
if (IS_ERR(newmap)) {
err = PTR_ERR(newmap);
goto bad_unlock;
@@ -5691,7 +5836,8 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
bad_unlock:
mutex_unlock(&mdsc->mutex);
bad:
- pr_err("error decoding mdsmap %d. Shutting down mount.\n", err);
+ pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n",
+ err);
ceph_umount_begin(mdsc->fsc->sb);
ceph_msg_dump(msg);
return;
@@ -5722,7 +5868,8 @@ static void mds_peer_reset(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- pr_warn("mds%d closed our session\n", s->s_mds);
+ pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
+ s->s_mds);
if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
send_mds_reconnect(mdsc, s);
}
@@ -5731,6 +5878,7 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int type = le16_to_cpu(msg->hdr.type);
mutex_lock(&mdsc->mutex);
@@ -5770,8 +5918,8 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
break;
default:
- pr_err("received unknown message type %d %s\n", type,
- ceph_msg_type_name(type));
+ pr_err_client(cl, "received unknown message type %d %s\n",
+ type, ceph_msg_type_name(type));
}
out:
ceph_msg_put(msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5a3714bdd64a..2e6ddaa13d72 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -14,9 +14,9 @@
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
-#include <linux/ceph/mdsmap.h>
#include <linux/ceph/auth.h>
+#include "mdsmap.h"
#include "metric.h"
#include "super.h"
@@ -33,8 +33,10 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
CEPHFS_FEATURE_OP_GETVXATTR,
CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
};
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -49,6 +51,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
CEPHFS_FEATURE_OP_GETVXATTR, \
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
}
/*
@@ -300,6 +303,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
int r_request_release_offset;
const struct cred *r_cred;
+ struct mnt_idmap *r_mnt_idmap;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -581,7 +585,8 @@ static inline void ceph_mdsc_free_path(char *path, int len)
__putname(path - (PATH_MAX - 1 - len));
}
-extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
+ struct dentry *dentry, int *plen, u64 *base,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
@@ -614,4 +619,6 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
+
+extern bool enable_unsafe_idmap;
#endif
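
/*
 * Illustration only (not part of the patch): the hunks above add
 * CEPHFS_FEATURE_HAS_OWNER_UIDGID to the feature enum and to the
 * CEPHFS_FEATURES_CLIENT_SUPPORTED list.  Below is a minimal user-space
 * sketch of how such a feature-id list can be packed into a bitmap and
 * queried; the names (feat_set_bit, feat_test_bit, FEAT_*) and the
 * ordinals are hypothetical stand-ins, not the kernel's real encoding.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum {
	FEAT_32BITS_RETRY_FWD = 10,	/* example ordinals, for illustration */
	FEAT_HAS_OWNER_UIDGID = 12,
	FEAT_MAX              = FEAT_HAS_OWNER_UIDGID,
};

static void feat_set_bit(uint8_t *map, unsigned int bit)
{
	map[bit / 8] |= 1u << (bit % 8);
}

static int feat_test_bit(const uint8_t *map, unsigned int bit)
{
	return (map[bit / 8] >> (bit % 8)) & 1u;
}

int main(void)
{
	static const unsigned int supported[] = {
		FEAT_32BITS_RETRY_FWD,
		FEAT_HAS_OWNER_UIDGID,
	};
	uint8_t map[(FEAT_MAX / 8) + 1];
	unsigned int i;

	memset(map, 0, sizeof(map));
	for (i = 0; i < sizeof(supported) / sizeof(supported[0]); i++)
		feat_set_bit(map, supported[i]);

	/* A peer lacking the owner-uid/gid feature would fail this test. */
	printf("HAS_OWNER_UIDGID supported: %d\n",
	       feat_test_bit(map, FEAT_HAS_OWNER_UIDGID));
	return 0;
}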
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 7dac21ee6ce7..fae97c25ce58 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -7,10 +7,11 @@
#include <linux/slab.h>
#include <linux/types.h>
-#include <linux/ceph/mdsmap.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
+#include "mdsmap.h"
+#include "mds_client.h"
#include "super.h"
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
@@ -114,8 +115,10 @@ bad:
* Ignore any fields we don't care about (there are quite a few of
* them).
*/
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mdsmap *m;
const void *start = *p;
int i, j, n;
@@ -233,20 +236,18 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
*p = info_end;
}
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
- i+1, n, global_id, mds, inc,
- ceph_pr_addr(&addr),
- ceph_mds_state_name(state),
- laggy ? "(laggy)" : "");
+ doutc(cl, "%d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id,
+ mds, inc, ceph_pr_addr(&addr),
+ ceph_mds_state_name(state), laggy ? "(laggy)" : "");
if (mds < 0 || mds >= m->possible_max_rank) {
- pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
+ pr_warn_client(cl, "got incorrect mds(%d)\n", mds);
continue;
}
if (state <= 0) {
- dout("mdsmap_decode got incorrect state(%s)\n",
- ceph_mds_state_name(state));
+ doutc(cl, "got incorrect state(%s)\n",
+ ceph_mds_state_name(state));
continue;
}
@@ -385,16 +386,16 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
m->m_max_xattr_size = 0;
}
bad_ext:
- dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
- !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
+ doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+ !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
*p = end;
- dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+ doutc(cl, "success epoch %u\n", m->m_epoch);
return m;
nomem:
err = -ENOMEM;
goto out_err;
corrupt:
- pr_err("corrupt mdsmap\n");
+ pr_err_client(cl, "corrupt mdsmap\n");
print_hex_dump(KERN_DEBUG, "mdsmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..89f1931f1ba6
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include <linux/bug.h>
+#include <linux/ceph/types.h>
+
+struct ceph_mds_client;
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually cares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ bool laggy;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ u64 m_max_xattr_size; /* maximum size for xattrs blob */
+ u32 m_max_mds; /* expected up:active mds number */
+ u32 m_num_active_mds; /* actual up:active mds number */
+ u32 possible_max_rank; /* possible max rank index */
+ struct ceph_mds_info *m_info;
+
+ /* which object pools file data can be stored in */
+ int m_num_data_pg_pools;
+ u64 *m_data_pg_pools;
+ u64 m_cas_pg_pool;
+
+ bool m_enabled;
+ bool m_damaged;
+ int m_num_laggy;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+ if (w >= m->possible_max_rank)
+ return NULL;
+ return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+ BUG_ON(w < 0);
+ if (w >= m->possible_max_rank)
+ return CEPH_MDS_STATE_DNE;
+ return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+ if (w >= 0 && w < m->possible_max_rank)
+ return m->m_info[w].laggy;
+ return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
+
+#endif
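
/*
 * Illustration only: a stripped-down, user-space model of the
 * bounds-checked rank accessors that the new mdsmap.h above provides
 * (ceph_mdsmap_get_state(), ceph_mdsmap_is_laggy()).  The structs and
 * MDS_STATE_* values here are simplified stand-ins, not the kernel
 * definitions; the point is only the out-of-range behaviour.
 */
#include <stdio.h>

#define MDS_STATE_DNE     0	/* stand-in: rank does not exist */
#define MDS_STATE_ACTIVE 13	/* stand-in for an active rank */

struct mdsmap_info {
	int state;
	int laggy;
};

struct mdsmap {
	unsigned int possible_max_rank;
	struct mdsmap_info *info;
};

/* Mirror of ceph_mdsmap_get_state(): out-of-range ranks read as DNE. */
static int mdsmap_get_state(const struct mdsmap *m, unsigned int rank)
{
	if (rank >= m->possible_max_rank)
		return MDS_STATE_DNE;
	return m->info[rank].state;
}

/* Mirror of ceph_mdsmap_is_laggy(): out-of-range ranks are never laggy. */
static int mdsmap_is_laggy(const struct mdsmap *m, unsigned int rank)
{
	if (rank >= m->possible_max_rank)
		return 0;
	return m->info[rank].laggy;
}

int main(void)
{
	struct mdsmap_info info[2] = {
		{ .state = MDS_STATE_ACTIVE, .laggy = 0 },
		{ .state = MDS_STATE_ACTIVE, .laggy = 1 },
	};
	struct mdsmap m = { .possible_max_rank = 2, .info = info };

	printf("rank 0: state=%d laggy=%d\n",
	       mdsmap_get_state(&m, 0), mdsmap_is_laggy(&m, 0));
	printf("rank 5: state=%d laggy=%d\n",	/* out of range */
	       mdsmap_get_state(&m, 5), mdsmap_is_laggy(&m, 5));
	return 0;
}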
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 6d3584f16f9a..871c1090e520 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -31,6 +31,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_client_metric *m = &mdsc->metric;
u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
s64 sum;
s32 items = 0;
@@ -51,8 +52,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
if (!msg) {
- pr_err("send metrics to mds%d, failed to allocate message\n",
- s->s_mds);
+ pr_err_client(cl, "to mds%d, failed to allocate message\n",
+ s->s_mds);
return false;
}
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index f7fcf7f08ec6..9d36c3532de1 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -43,6 +43,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
{
struct super_block *sb = mdsc->fsc->sb;
struct ceph_mds_quota *h = msg->front.iov_base;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_vino vino;
struct inode *inode;
struct ceph_inode_info *ci;
@@ -51,8 +52,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
return;
if (msg->front.iov_len < sizeof(*h)) {
- pr_err("%s corrupt message mds%d len %d\n", __func__,
- session->s_mds, (int)msg->front.iov_len);
+ pr_err_client(cl, "corrupt message mds%d len %d\n",
+ session->s_mds, (int)msg->front.iov_len);
ceph_msg_dump(msg);
goto out;
}
@@ -62,7 +63,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
- pr_warn("Failed to find inode %llu\n", vino.ino);
+ pr_warn_client(cl, "failed to find inode %llx\n", vino.ino);
goto out;
}
ci = ceph_inode(inode);
@@ -85,6 +86,7 @@ find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
{
struct ceph_quotarealm_inode *qri = NULL;
struct rb_node **node, *parent = NULL;
+ struct ceph_client *cl = mdsc->fsc->client;
mutex_lock(&mdsc->quotarealms_inodes_mutex);
node = &(mdsc->quotarealms_inodes.rb_node);
@@ -110,7 +112,7 @@ find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
rb_link_node(&qri->node, parent, node);
rb_insert_color(&qri->node, &mdsc->quotarealms_inodes);
} else
- pr_warn("Failed to alloc quotarealms_inode\n");
+ pr_warn_client(cl, "Failed to alloc quotarealms_inode\n");
}
mutex_unlock(&mdsc->quotarealms_inodes_mutex);
@@ -129,6 +131,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
struct super_block *sb,
struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_quotarealm_inode *qri;
struct inode *in;
@@ -161,8 +164,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
}
if (IS_ERR(in)) {
- dout("Can't lookup inode %llx (err: %ld)\n",
- realm->ino, PTR_ERR(in));
+ doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino,
+ PTR_ERR(in));
qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
} else {
qri->timeout = 0;
@@ -213,6 +216,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
enum quota_get_realm which_quota,
bool retry)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next;
struct inode *in;
@@ -226,8 +230,9 @@ restart:
if (realm)
ceph_get_snap_realm(mdsc, realm);
else
- pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
- "null i_snap_realm\n", ceph_vinop(inode));
+ pr_err_ratelimited_client(cl,
+ "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
while (realm) {
bool has_inode;
@@ -317,6 +322,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
loff_t delta)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_snap_realm *realm, *next;
struct inode *in;
@@ -332,8 +338,9 @@ restart:
if (realm)
ceph_get_snap_realm(mdsc, realm);
else
- pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
- "null i_snap_realm\n", ceph_vinop(inode));
+ pr_err_ratelimited_client(cl,
+ "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
while (realm) {
bool has_inode;
@@ -383,7 +390,7 @@ restart:
break;
default:
/* Shouldn't happen */
- pr_warn("Invalid quota check op (%d)\n", op);
+ pr_warn_client(cl, "Invalid quota check op (%d)\n", op);
exceeded = true; /* Just break the loop */
}
iput(in);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 813f21add992..c65f2b202b2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -138,7 +138,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
- dout("%s %llx %p\n", __func__, realm->ino, realm);
+ doutc(mdsc->fsc->client, "%llx %p\n", realm->ino, realm);
return realm;
}
@@ -150,6 +150,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
@@ -162,7 +163,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("%s %llx %p\n", __func__, r->ino, r);
+ doutc(cl, "%llx %p\n", r->ino, r);
return r;
}
}
@@ -188,9 +189,10 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("%s %p %llx\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
mdsc->num_snap_realms--;
@@ -290,6 +292,7 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm,
u64 parentino)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snap_realm *parent;
lockdep_assert_held_write(&mdsc->snap_rwsem);
@@ -303,8 +306,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino,
- realm, realm->parent_ino, realm->parent, parentino, parent);
+ doutc(cl, "%llx %p: %llx %p -> %llx %p\n", realm->ino, realm,
+ realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
@@ -329,10 +332,12 @@ static int cmpu64_rev(const void *a, const void *b)
/*
* build the snap context for a given realm.
*/
-static int build_snap_context(struct ceph_snap_realm *realm,
+static int build_snap_context(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
struct list_head *realm_queue,
struct list_head *dirty_realms)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
int err = 0;
@@ -360,10 +365,10 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n",
- __func__, realm->ino, realm, realm->cached_context,
- realm->cached_context->seq,
- (unsigned int)realm->cached_context->num_snaps);
+ doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ realm->ino, realm, realm->cached_context,
+ realm->cached_context->seq,
+ (unsigned int)realm->cached_context->num_snaps);
return 0;
}
@@ -400,8 +405,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino,
- realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps);
+ doutc(cl, "%llx %p: %p seq %lld (%u snaps)\n", realm->ino, realm,
+ snapc, snapc->seq, (unsigned int) snapc->num_snaps);
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
@@ -418,16 +423,18 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err);
+ pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err);
return err;
}
/*
* rebuild snap context for the given realm and all of its children.
*/
-static void rebuild_snap_realms(struct ceph_snap_realm *realm,
+static void rebuild_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
struct list_head *dirty_realms)
{
+ struct ceph_client *cl = mdsc->fsc->client;
LIST_HEAD(realm_queue);
int last = 0;
bool skip = false;
@@ -451,9 +458,10 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm,
continue;
}
- last = build_snap_context(_realm, &realm_queue, dirty_realms);
- dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
- last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
+ last = build_snap_context(mdsc, _realm, &realm_queue,
+ dirty_realms);
+ doutc(cl, "%llx %p, %s\n", realm->ino, realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
/* is any child in the list ? */
list_for_each_entry(child, &_realm->children, child_item) {
@@ -523,6 +531,7 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap **pcapsnap)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_snap_context *old_snapc, *new_snapc;
struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
@@ -548,14 +557,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("%s %p %llx.%llx already pending\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx already pending\n", inode,
+ ceph_vinop(inode));
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
- dout("%s %p %llx.%llx nothing dirty|writing\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx nothing dirty|writing\n", inode,
+ ceph_vinop(inode));
goto update_snapc;
}
@@ -575,15 +584,15 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
- dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx no new_snap|dirty_page|writing\n",
+ inode, ceph_vinop(inode));
goto update_snapc;
}
}
- dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n",
- __func__, inode, ceph_vinop(inode), capsnap, old_snapc,
- ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
+ doutc(cl, "%p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
ihold(inode);
capsnap->follows = old_snapc->seq;
@@ -615,9 +624,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
- " now pending\n", __func__, inode, ceph_vinop(inode),
- capsnap, old_snapc, old_snapc->seq);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", inode, ceph_vinop(inode), capsnap,
+ old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
@@ -634,7 +643,7 @@ update_snapc:
ci->i_head_snapc = NULL;
} else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
- dout(" new snapc is %p\n", new_snapc);
+ doutc(cl, " new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
@@ -655,11 +664,12 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
BUG_ON(capsnap->writing);
capsnap->size = i_size_read(inode);
- capsnap->mtime = inode->i_mtime;
- capsnap->atime = inode->i_atime;
+ capsnap->mtime = inode_get_mtime(inode);
+ capsnap->atime = inode_get_atime(inode);
capsnap->ctime = inode_get_ctime(inode);
capsnap->btime = ci->i_btime;
capsnap->change_attr = inode_peek_iversion_raw(inode);
@@ -667,11 +677,12 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", __func__, inode,
- ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size, capsnap->dirty_pages);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
+ "s=%llu still has %d dirty pages\n", inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq,
+ ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
return 0;
}
@@ -680,20 +691,20 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
* And trigger to flush the buffer immediately.
*/
if (ci->i_wrbuffer_ref) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
- "used WRBUFFER, delaying\n", __func__, inode,
- ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
+ "s=%llu used WRBUFFER, delaying\n", inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
ceph_queue_writeback(inode);
return 0;
}
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
- __func__, inode, ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ inode, ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
if (list_empty(&ci->i_snap_flush_item)) {
@@ -708,13 +719,15 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
* Queue cap_snaps for snap writeback for this realm and its children.
* Called under snap_rwsem, so realm topology won't change.
*/
-static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
struct ceph_cap_snap *capsnap = NULL;
- dout("%s %p %llx inode\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx inode\n", realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
@@ -733,8 +746,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
if (!capsnap) {
capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n",
- inode);
+ pr_err_client(cl,
+ "ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
return;
}
}
@@ -752,7 +766,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
if (capsnap)
kmem_cache_free(ceph_cap_snap_cachep, capsnap);
- dout("%s %p %llx done\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx done\n", realm, realm->ino);
}
/*
@@ -766,6 +780,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
void *p, void *e, bool deletion,
struct ceph_snap_realm **realm_ret)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
@@ -780,7 +795,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("%s deletion=%d\n", __func__, deletion);
+ doutc(cl, "deletion=%d\n", deletion);
more:
realm = NULL;
rebuild_snapcs = 0;
@@ -810,8 +825,8 @@ more:
rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("%s updating %llx %p %lld -> %lld\n", __func__,
- realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+ doutc(cl, "updating %llx %p %lld -> %lld\n", realm->ino,
+ realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
realm->created = le64_to_cpu(ri->created);
@@ -834,16 +849,16 @@ more:
rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("%s %llx %p seq %lld new\n", __func__,
- realm->ino, realm, realm->seq);
+ doutc(cl, "%llx %p seq %lld new\n", realm->ino, realm,
+ realm->seq);
rebuild_snapcs = 1;
} else {
- dout("%s %llx %p seq %lld unchanged\n", __func__,
- realm->ino, realm, realm->seq);
+ doutc(cl, "%llx %p seq %lld unchanged\n", realm->ino, realm,
+ realm->seq);
}
- dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
- realm, rebuild_snapcs, p, e);
+ doutc(cl, "done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
/*
* this will always track the uppest parent realm from which
@@ -855,7 +870,7 @@ more:
/* rebuild_snapcs when we reach the _end_ (root) of the trace */
if (realm_to_rebuild && p >= e)
- rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
+ rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -873,7 +888,7 @@ more:
realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
dirty_item);
list_del_init(&realm->dirty_item);
- queue_realm_cap_snaps(realm);
+ queue_realm_cap_snaps(mdsc, realm);
}
if (realm_ret)
@@ -891,7 +906,7 @@ fail:
ceph_put_snap_realm(mdsc, realm);
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
- pr_err("%s error %d\n", __func__, err);
+ pr_err_client(cl, "error %d\n", err);
/*
* When receiving a corrupted snap trace we don't know what
@@ -905,11 +920,12 @@ fail:
WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
if (ret)
- pr_err("%s failed to blocklist %s: %d\n", __func__,
- ceph_pr_addr(&client->msgr.inst.addr), ret);
+ pr_err_client(cl, "failed to blocklist %s: %d\n",
+ ceph_pr_addr(&client->msgr.inst.addr), ret);
- WARN(1, "%s: %s%sdo remount to continue%s",
- __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
+ WARN(1, "[client.%lld] %s %s%sdo remount to continue%s",
+ client->monc.auth->global_id, __func__,
+ ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
ret ? "" : " was blocklisted, ",
err == -EIO ? " after corrupted snaptrace is fixed" : "");
@@ -925,11 +941,12 @@ fail:
*/
static void flush_snaps(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("%s\n", __func__);
+ doutc(cl, "begin\n");
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
@@ -944,7 +961,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_flush_lock);
ceph_put_mds_session(session);
- dout("%s done\n", __func__);
+ doutc(cl, "done\n");
}
/**
@@ -960,7 +977,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1000,6 +1017,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct super_block *sb = mdsc->fsc->sb;
int mds = session->s_mds;
u64 split;
@@ -1030,8 +1048,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
- mds, ceph_snap_op_name(op), split, trace_len);
+ doutc(cl, "from mds%d op %s split %llx tracelen %d\n", mds,
+ ceph_snap_op_name(op), split, trace_len);
down_write(&mdsc->snap_rwsem);
locked_rwsem = 1;
@@ -1062,7 +1080,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
goto out;
}
- dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+ doutc(cl, "splitting snap_realm %llx %p\n", realm->ino, realm);
for (i = 0; i < num_split_inos; i++) {
struct ceph_vino vino = {
.ino = le64_to_cpu(split_inos[i]),
@@ -1087,13 +1105,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p %llx.%llx in newer realm %llx %p\n",
- inode, ceph_vinop(inode), ci->i_snap_realm->ino,
- ci->i_snap_realm);
+ doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
+ ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p %llx.%llx to split realm %llx %p\n",
- inode, ceph_vinop(inode), realm->ino, realm);
+ doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
ceph_change_snap_realm(inode, realm);
@@ -1154,7 +1172,7 @@ skip_inode:
return;
bad:
- pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
+ pr_err_client(cl, "corrupt snap message from mds%d\n", mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
@@ -1170,6 +1188,7 @@ out:
struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm, *exist;
struct rb_node **p, *parent;
int ret;
@@ -1192,8 +1211,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
}
spin_unlock(&mdsc->snapid_map_lock);
if (exist) {
- dout("%s found snapid map %llx -> %x\n", __func__,
- exist->snap, exist->dev);
+ doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
+ exist->dev);
return exist;
}
@@ -1237,13 +1256,12 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
if (exist) {
free_anon_bdev(sm->dev);
kfree(sm);
- dout("%s found snapid map %llx -> %x\n", __func__,
- exist->snap, exist->dev);
+ doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
+ exist->dev);
return exist;
}
- dout("%s create snapid map %llx -> %x\n", __func__,
- sm->snap, sm->dev);
+ doutc(cl, "create snapid map %llx -> %x\n", sm->snap, sm->dev);
return sm;
}
@@ -1268,6 +1286,7 @@ void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm;
unsigned long now;
LIST_HEAD(to_free);
@@ -1289,7 +1308,7 @@ void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
while (!list_empty(&to_free)) {
sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
list_del(&sm->lru);
- dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+ doutc(cl, "trim snapid map %llx -> %x\n", sm->snap, sm->dev);
free_anon_bdev(sm->dev);
kfree(sm);
}
@@ -1297,6 +1316,7 @@ void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm;
struct rb_node *p;
LIST_HEAD(to_free);
@@ -1315,8 +1335,8 @@ void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
list_del(&sm->lru);
free_anon_bdev(sm->dev);
if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
- pr_err("snapid map %llx -> %x still in use\n",
- sm->snap, sm->dev);
+ pr_err_client(cl, "snapid map %llx -> %x still in use\n",
+ sm->snap, sm->dev);
}
kfree(sm);
}
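
/*
 * Illustration only: the snapid-map trim path above detaches expired
 * entries onto a private list while holding the lock, then frees them
 * after dropping it.  This is a minimal user-space sketch of that
 * "collect under the lock, free outside it" pattern; the singly linked
 * list and types are stand-ins, not the kernel's list_head machinery.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct snapid_map {
	unsigned long long snap;
	long last_used;			/* pretend timestamp */
	struct snapid_map *next;
};

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static struct snapid_map *lru_head;	/* global LRU, protected by map_lock */

static void trim_snapid_map(long now, long timeout)
{
	struct snapid_map *to_free = NULL, *sm, **pp;

	/* Phase 1: under the lock, unlink expired entries onto to_free. */
	pthread_mutex_lock(&map_lock);
	pp = &lru_head;
	while ((sm = *pp) != NULL) {
		if (now - sm->last_used > timeout) {
			*pp = sm->next;		/* unlink from the LRU */
			sm->next = to_free;
			to_free = sm;
		} else {
			pp = &sm->next;
		}
	}
	pthread_mutex_unlock(&map_lock);

	/* Phase 2: free outside the lock, where blocking would be safe. */
	while ((sm = to_free) != NULL) {
		to_free = sm->next;
		printf("trim snapid map %llx\n", sm->snap);
		free(sm);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct snapid_map *sm = calloc(1, sizeof(*sm));

		if (!sm)
			break;
		sm->snap = 0x100 + i;
		sm->last_used = i;	/* the oldest two will look stale */
		sm->next = lru_head;
		lru_head = sm;
	}
	trim_snapid_map(/* now */ 10, /* timeout */ 8);
	return 0;
}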
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2d7f5a8d4a92..5ec102f6b1ac 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -44,28 +44,29 @@ static LIST_HEAD(ceph_fsc_list);
*/
static void ceph_put_super(struct super_block *s)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
- dout("put_super\n");
+ doutc(fsc->client, "begin\n");
ceph_fscrypt_free_dummy_policy(fsc);
ceph_mdsc_close_sessions(fsc->mdsc);
+ doutc(fsc->client, "done\n");
}
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(d_inode(dentry));
struct ceph_mon_client *monc = &fsc->client->monc;
struct ceph_statfs st;
int i, err;
u64 data_pool;
+ doutc(fsc->client, "begin\n");
if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];
} else {
data_pool = CEPH_NOPOOL;
}
- dout("statfs\n");
err = ceph_monc_do_statfs(monc, data_pool, &st);
if (err < 0)
return err;
@@ -113,24 +114,26 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
/* fold the fs_cluster_id into the upper bits */
buf->f_fsid.val[1] = monc->fs_cluster_id;
+ doutc(fsc->client, "done\n");
return 0;
}
static int ceph_sync_fs(struct super_block *sb, int wait)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
if (!wait) {
- dout("sync_fs (non-blocking)\n");
+ doutc(cl, "(non-blocking)\n");
ceph_flush_dirty_caps(fsc->mdsc);
- dout("sync_fs (non-blocking) done\n");
+ doutc(cl, "(non-blocking) done\n");
return 0;
}
- dout("sync_fs (blocking)\n");
+ doutc(cl, "(blocking)\n");
ceph_osdc_sync(&fsc->client->osdc);
ceph_mdsc_sync(fsc->mdsc);
- dout("sync_fs (blocking) done\n");
+ doutc(cl, "(blocking) done\n");
return 0;
}
@@ -341,7 +344,7 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
char *dev_name = param->string, *dev_name_end;
int ret;
- dout("%s '%s'\n", __func__, dev_name);
+ dout("'%s'\n", dev_name);
if (!dev_name || !*dev_name)
return invalfc(fc, "Empty source");
@@ -413,7 +416,7 @@ static int ceph_parse_mount_param(struct fs_context *fc,
return ret;
token = fs_parse(fc, ceph_mount_parameters, param, &result);
- dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
+ dout("%s: fs_parse '%s' token %d\n",__func__, param->key, token);
if (token < 0)
return token;
@@ -684,7 +687,7 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
*/
static int ceph_show_options(struct seq_file *m, struct dentry *root)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(root->d_sb);
struct ceph_mount_options *fsopt = fsc->mount_options;
size_t pos;
int ret;
@@ -881,7 +884,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc)
static void destroy_fs_client(struct ceph_fs_client *fsc)
{
- dout("destroy_fs_client %p\n", fsc);
+ doutc(fsc->client, "%p\n", fsc);
spin_lock(&ceph_fsc_lock);
list_del(&fsc->metric_wakeup);
@@ -896,7 +899,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
ceph_destroy_client(fsc->client);
kfree(fsc);
- dout("destroy_fs_client %p done\n", fsc);
+ dout("%s: %p done\n", __func__, fsc);
}
/*
@@ -1015,9 +1018,9 @@ static void __ceph_umount_begin(struct ceph_fs_client *fsc)
*/
void ceph_umount_begin(struct super_block *sb)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
- dout("ceph_umount_begin - starting forced umount\n");
+ doutc(fsc->client, "starting forced umount\n");
if (!fsc)
return;
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
@@ -1045,13 +1048,14 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
const char *path,
unsigned long started)
{
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req = NULL;
int err;
struct dentry *root;
/* open dir */
- dout("open_root_inode opening '%s'\n", path);
+ doutc(cl, "opening '%s'\n", path);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
@@ -1071,13 +1075,13 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
if (err == 0) {
struct inode *inode = req->r_target_inode;
req->r_target_inode = NULL;
- dout("open_root_inode success\n");
+ doutc(cl, "success\n");
root = d_make_root(inode);
if (!root) {
root = ERR_PTR(-ENOMEM);
goto out;
}
- dout("open_root_inode success, root dentry is %p\n", root);
+ doutc(cl, "success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
}
@@ -1136,11 +1140,12 @@ static int ceph_apply_test_dummy_encryption(struct super_block *sb,
static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
struct fs_context *fc)
{
+ struct ceph_client *cl = fsc->client;
int err;
unsigned long started = jiffies; /* note the start time */
struct dentry *root;
- dout("mount start %p\n", fsc);
+ doutc(cl, "mount start %p\n", fsc);
mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
@@ -1163,7 +1168,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
if (err)
goto out;
- dout("mount opening path '%s'\n", path);
+ doutc(cl, "mount opening path '%s'\n", path);
ceph_fs_debugfs_init(fsc);
@@ -1178,7 +1183,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
}
fsc->mount_state = CEPH_MOUNT_MOUNTED;
- dout("mount success\n");
+ doutc(cl, "mount success\n");
mutex_unlock(&fsc->client->mount_mutex);
return root;
@@ -1191,9 +1196,10 @@ out:
static int ceph_set_super(struct super_block *s, struct fs_context *fc)
{
struct ceph_fs_client *fsc = s->s_fs_info;
+ struct ceph_client *cl = fsc->client;
int ret;
- dout("set_super %p\n", s);
+ doutc(cl, "%p\n", s);
s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1226,31 +1232,32 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
struct ceph_fs_client *new = fc->s_fs_info;
struct ceph_mount_options *fsopt = new->mount_options;
struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
- dout("ceph_compare_super %p\n", sb);
+ doutc(cl, "%p\n", sb);
if (compare_mount_options(fsopt, opt, fsc)) {
- dout("monitor(s)/mount options don't match\n");
+ doutc(cl, "monitor(s)/mount options don't match\n");
return 0;
}
if ((opt->flags & CEPH_OPT_FSID) &&
ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
- dout("fsid doesn't match\n");
+ doutc(cl, "fsid doesn't match\n");
return 0;
}
if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) {
- dout("flags differ\n");
+ doutc(cl, "flags differ\n");
return 0;
}
if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
- dout("client is blocklisted (and CLEANRECOVER is not set)\n");
+ doutc(cl, "client is blocklisted (and CLEANRECOVER is not set)\n");
return 0;
}
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
- dout("client has been forcibly unmounted\n");
+ doutc(cl, "client has been forcibly unmounted\n");
return 0;
}
@@ -1322,9 +1329,9 @@ static int ceph_get_tree(struct fs_context *fc)
goto out;
}
- if (ceph_sb_to_client(sb) != fsc) {
+ if (ceph_sb_to_fs_client(sb) != fsc) {
destroy_fs_client(fsc);
- fsc = ceph_sb_to_client(sb);
+ fsc = ceph_sb_to_fs_client(sb);
dout("get_sb got existing client %p\n", fsc);
} else {
dout("get_sb using new client %p\n", fsc);
@@ -1338,8 +1345,9 @@ static int ceph_get_tree(struct fs_context *fc)
err = PTR_ERR(res);
goto out_splat;
}
- dout("root %p inode %p ino %llx.%llx\n", res,
- d_inode(res), ceph_vinop(d_inode(res)));
+
+ doutc(fsc->client, "root %p inode %p ino %llx.%llx\n", res,
+ d_inode(res), ceph_vinop(d_inode(res)));
fc->root = fsc->sb->s_root;
return 0;
@@ -1377,7 +1385,7 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
struct ceph_parse_opts_ctx *pctx = fc->fs_private;
struct ceph_mount_options *fsopt = pctx->opts;
struct super_block *sb = fc->root->d_sb;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
err = ceph_apply_test_dummy_encryption(sb, fc, fsopt);
if (err)
@@ -1397,7 +1405,8 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
kfree(fsc->mount_options->mon_addr);
fsc->mount_options->mon_addr = fsopt->mon_addr;
fsopt->mon_addr = NULL;
- pr_notice("ceph: monitor addresses recorded, but not used for reconnection");
+ pr_notice_client(fsc->client,
+ "monitor addresses recorded, but not used for reconnection");
}
sync_filesystem(sb);
@@ -1516,11 +1525,12 @@ void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc)
static void ceph_kill_sb(struct super_block *s)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
bool wait;
- dout("kill_sb %p\n", s);
+ doutc(cl, "%p\n", s);
ceph_mdsc_pre_umount(mdsc);
flush_fs_workqueues(fsc);
@@ -1551,9 +1561,9 @@ static void ceph_kill_sb(struct super_block *s)
&mdsc->stopping_waiter,
fsc->client->options->mount_timeout);
if (!timeleft) /* timed out */
- pr_warn("umount timed out, %ld\n", timeleft);
+ pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
else if (timeleft < 0) /* killed */
- pr_warn("umount was killed, %ld\n", timeleft);
+ pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
}
mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
@@ -1572,13 +1582,13 @@ static struct file_system_type ceph_fs_type = {
.name = "ceph",
.init_fs_context = ceph_init_fs_context,
.kill_sb = ceph_kill_sb,
- .fs_flags = FS_RENAME_DOES_D_MOVE,
+ .fs_flags = FS_RENAME_DOES_D_MOVE | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ceph");
int ceph_force_reconnect(struct super_block *sb)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
int err = 0;
fsc->mount_state = CEPH_MOUNT_RECOVER;
@@ -1671,6 +1681,11 @@ static const struct kernel_param_ops param_ops_mount_syntax = {
module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444);
module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444);
+bool enable_unsafe_idmap = false;
+module_param(enable_unsafe_idmap, bool, 0644);
+MODULE_PARM_DESC(enable_unsafe_idmap,
+ "Allow to use idmapped mounts with MDS without CEPHFS_FEATURE_HAS_OWNER_UIDGID");
+
module_init(init_ceph);
module_exit(exit_ceph);
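
/*
 * Illustration only: the conversion above replaces bare dout()/pr_err()
 * calls with doutc()/pr_*_client() so each message is tagged with the
 * client that emitted it (the WARN hunk shows the "[client.%lld]" form).
 * A rough user-space model of that kind of wrapper might look like the
 * sketch below; the macro name and struct are made up for the sketch and
 * are not the libceph implementation.
 */
#include <stdio.h>

struct client {
	long long global_id;
};

#define doutc_demo(cl, fmt, ...) \
	printf("[client.%lld] %s: " fmt, (cl)->global_id, __func__, ##__VA_ARGS__)

static void sync_fs(struct client *cl, int wait)
{
	doutc_demo(cl, "(%s)\n", wait ? "blocking" : "non-blocking");
	/* ... do the actual flushing here ... */
	doutc_demo(cl, "(%s) done\n", wait ? "blocking" : "non-blocking");
}

int main(void)
{
	struct client cl = { .global_id = 4215 };

	sync_fs(&cl, 0);	/* prints "[client.4215] sync_fs: (non-blocking)" etc. */
	return 0;
}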
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 51c7f2b14f6f..fe0f64a0acb2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -488,13 +488,13 @@ ceph_inode(const struct inode *inode)
}
static inline struct ceph_fs_client *
-ceph_inode_to_client(const struct inode *inode)
+ceph_inode_to_fs_client(const struct inode *inode)
{
return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}
static inline struct ceph_fs_client *
-ceph_sb_to_client(const struct super_block *sb)
+ceph_sb_to_fs_client(const struct super_block *sb)
{
return (struct ceph_fs_client *)sb->s_fs_info;
}
@@ -502,7 +502,13 @@ ceph_sb_to_client(const struct super_block *sb)
static inline struct ceph_mds_client *
ceph_sb_to_mdsc(const struct super_block *sb)
{
- return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
+ return (struct ceph_mds_client *)ceph_sb_to_fs_client(sb)->mdsc;
+}
+
+static inline struct ceph_client *
+ceph_inode_to_client(const struct inode *inode)
+{
+ return (struct ceph_client *)ceph_inode_to_fs_client(inode)->client;
}
static inline struct ceph_vino
@@ -558,7 +564,7 @@ static inline u64 ceph_snap(struct inode *inode)
*/
static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
{
- if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+ if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32)))
return ceph_ino_to_ino32(ino);
return ino;
}
@@ -1094,8 +1100,8 @@ struct ceph_iattr {
struct ceph_fscrypt_auth *fscrypt_auth;
};
-extern int __ceph_setattr(struct inode *inode, struct iattr *attr,
- struct ceph_iattr *cia);
+extern int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct iattr *attr, struct ceph_iattr *cia);
extern int ceph_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct mnt_idmap *idmap,
@@ -1106,7 +1112,7 @@ void ceph_inode_shutdown(struct inode *inode);
static inline bool ceph_inode_is_shutdown(struct inode *inode)
{
unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int state = READ_ONCE(fsc->mount_state);
return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN;
@@ -1119,7 +1125,7 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-extern const struct xattr_handler *ceph_xattr_handlers[];
+extern const struct xattr_handler * const ceph_xattr_handlers[];
struct ceph_acl_sec_ctx {
#ifdef CONFIG_CEPH_FS_POSIX_ACL
@@ -1223,7 +1229,8 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
-extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ bool queue_release);
extern void __ceph_remove_caps(struct ceph_inode_info *ci);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
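
/*
 * Illustration only: after the rename above, ceph_inode_to_fs_client()
 * still returns the filesystem-level client while the new
 * ceph_inode_to_client() reaches one level further to the libceph client
 * that doutc()/pr_*_client() take.  The stand-in structs below just model
 * that pointer chain; they are not the real kernel types.
 */
#include <stdio.h>

struct ceph_client_demo { long long global_id; };
struct fs_client_demo   { struct ceph_client_demo *client; };
struct super_block_demo { void *s_fs_info; };
struct inode_demo       { struct super_block_demo *i_sb; };

static struct fs_client_demo *inode_to_fs_client(const struct inode_demo *inode)
{
	return (struct fs_client_demo *)inode->i_sb->s_fs_info;
}

static struct ceph_client_demo *inode_to_client(const struct inode_demo *inode)
{
	return inode_to_fs_client(inode)->client;
}

int main(void)
{
	struct ceph_client_demo cl = { .global_id = 4215 };
	struct fs_client_demo fsc = { .client = &cl };
	struct super_block_demo sb = { .s_fs_info = &fsc };
	struct inode_demo inode = { .i_sb = &sb };

	printf("client.%lld\n", inode_to_client(&inode)->global_id);
	return 0;
}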
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0deae4a0f5f1..e066a556eccb 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,7 +57,8 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_string *pool_ns;
s64 pool = ci->i_layout.pool_id;
@@ -69,7 +70,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
- dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);
+ doutc(cl, "%p\n", &ci->netfs.inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
@@ -161,7 +162,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
char *val, size_t size)
{
ssize_t ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
const char *pool_name;
@@ -313,7 +314,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
}
@@ -321,7 +322,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "client%lld",
ceph_client_gid(fsc->client));
@@ -570,6 +571,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
int flags, int update_xattr,
struct ceph_inode_xattr **newxattr)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
@@ -626,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr->should_free_name = update_xattr;
ci->i_xattrs.count++;
- dout("%s count=%d\n", __func__, ci->i_xattrs.count);
+ doutc(cl, "count=%d\n", ci->i_xattrs.count);
} else {
kfree(*newxattr);
*newxattr = NULL;
@@ -654,13 +657,13 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (new) {
rb_link_node(&xattr->node, parent, p);
rb_insert_color(&xattr->node, &ci->i_xattrs.index);
- dout("%s p=%p\n", __func__, p);
+ doutc(cl, "p=%p\n", p);
}
- dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__,
- ceph_vinop(&ci->netfs.inode), xattr, name_len, name,
- min(val_len, MAX_XATTR_VAL_PRINT_LEN), val,
- val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
+ doutc(cl, "added %p %llx.%llx xattr %p %.*s=%.*s%s\n", inode,
+ ceph_vinop(inode), xattr, name_len, name, min(val_len,
+ MAX_XATTR_VAL_PRINT_LEN), val,
+ val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
return 0;
}
@@ -668,6 +671,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
const char *name)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
@@ -688,13 +692,13 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
else {
int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN);
- dout("%s %s: found %.*s%s\n", __func__, name, len,
- xattr->val, xattr->val_len > len ? "..." : "");
+ doutc(cl, "%s found %.*s%s\n", name, len, xattr->val,
+ xattr->val_len > len ? "..." : "");
return xattr;
}
}
- dout("%s %s: not found\n", __func__, name);
+ doutc(cl, "%s not found\n", name);
return NULL;
}
@@ -735,19 +739,20 @@ static int __remove_xattr(struct ceph_inode_info *ci,
static char *__copy_xattr_names(struct ceph_inode_info *ci,
char *dest)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
p = rb_first(&ci->i_xattrs.index);
- dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+ doutc(cl, "count=%d\n", ci->i_xattrs.count);
while (p) {
xattr = rb_entry(p, struct ceph_inode_xattr, node);
memcpy(dest, xattr->name, xattr->name_len);
dest[xattr->name_len] = '\0';
- dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
- xattr->name_len, ci->i_xattrs.names_size);
+ doutc(cl, "dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+ xattr->name_len, ci->i_xattrs.names_size);
dest += xattr->name_len + 1;
p = rb_next(p);
@@ -758,19 +763,19 @@ static char *__copy_xattr_names(struct ceph_inode_info *ci,
void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node *p, *tmp;
struct ceph_inode_xattr *xattr = NULL;
p = rb_first(&ci->i_xattrs.index);
- dout("__ceph_destroy_xattrs p=%p\n", p);
+ doutc(cl, "p=%p\n", p);
while (p) {
xattr = rb_entry(p, struct ceph_inode_xattr, node);
tmp = p;
p = rb_next(tmp);
- dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
- xattr->name_len, xattr->name);
+ doutc(cl, "next p=%p (%.*s)\n", p, xattr->name_len, xattr->name);
rb_erase(tmp, &ci->i_xattrs.index);
__free_xattr(xattr);
@@ -787,6 +792,7 @@ static int __build_xattrs(struct inode *inode)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u32 namelen;
u32 numattr = 0;
void *p, *end;
@@ -798,8 +804,8 @@ static int __build_xattrs(struct inode *inode)
int err = 0;
int i;
- dout("__build_xattrs() len=%d\n",
- ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+ doutc(cl, "len=%d\n",
+ ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
return 0; /* already built */
@@ -874,6 +880,8 @@ bad:
static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
int val_size)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
+
/*
* 4 bytes for the length, and additional 4 bytes per each xattr name,
* 4 bytes per each value
@@ -881,9 +889,8 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
int size = 4 + ci->i_xattrs.count*(4 + 4) +
ci->i_xattrs.names_size +
ci->i_xattrs.vals_size;
- dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
- ci->i_xattrs.count, ci->i_xattrs.names_size,
- ci->i_xattrs.vals_size);
+ doutc(cl, "c=%d names.size=%d vals.size=%d\n", ci->i_xattrs.count,
+ ci->i_xattrs.names_size, ci->i_xattrs.vals_size);
if (name_size)
size += 4 + 4 + name_size + val_size;
@@ -899,12 +906,14 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
*/
struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->netfs.inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
@@ -962,6 +971,7 @@ static inline int __get_request_mask(struct inode *in) {
ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
struct ceph_vxattr *vxattr;
@@ -1000,8 +1010,9 @@ handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
+ doutc(cl, "%p %llx.%llx name '%s' ver=%lld index_ver=%lld\n", inode,
+ ceph_vinop(inode), name, ci->i_xattrs.version,
+ ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
!((req_mask & CEPH_CAP_XATTR_SHARED) ||
@@ -1010,8 +1021,9 @@ handle_non_vxattrs:
/* security module gets xattr while filling trace */
if (current->journal_info) {
- pr_warn_ratelimited("sync getxattr %p "
- "during filling trace\n", inode);
+ pr_warn_ratelimited_client(cl,
+ "sync %p %llx.%llx during filling trace\n",
+ inode, ceph_vinop(inode));
return -EBUSY;
}
@@ -1053,14 +1065,16 @@ out:
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
struct inode *inode = d_inode(dentry);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
bool len_only = (size == 0);
u32 namelen;
int err;
spin_lock(&ci->i_ceph_lock);
- dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
+ doutc(cl, "%p %llx.%llx ver=%lld index_ver=%lld\n", inode,
+ ceph_vinop(inode), ci->i_xattrs.version,
+ ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
!__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) {
@@ -1094,7 +1108,8 @@ out:
static int ceph_sync_setxattr(struct inode *inode, const char *name,
const char *value, size_t size, int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1119,7 +1134,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
flags |= CEPH_XATTR_REMOVE;
}
- dout("setxattr value size: %zu\n", size);
+ doutc(cl, "name %s value size %zu\n", name, size);
/* do request */
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1148,10 +1163,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
req->r_num_caps = 1;
req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+ doutc(cl, "xattr.ver (before): %lld\n", ci->i_xattrs.version);
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
- dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+ doutc(cl, "xattr.ver (after): %lld\n", ci->i_xattrs.version);
out:
if (pagelist)
@@ -1162,9 +1177,10 @@ out:
int __ceph_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
struct ceph_buffer *old_blob = NULL;
int issued;
@@ -1220,9 +1236,9 @@ retry:
required_blob_size = __get_required_blob_size(ci, name_len, val_len);
if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
(required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
- dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
- __func__, ci->i_xattrs.version, required_blob_size,
- mdsc->mdsmap->m_max_xattr_size);
+ doutc(cl, "sync version: %llu size: %d max: %llu\n",
+ ci->i_xattrs.version, required_blob_size,
+ mdsc->mdsmap->m_max_xattr_size);
goto do_sync;
}
@@ -1236,8 +1252,8 @@ retry:
}
}
- dout("setxattr %p name '%s' issued %s\n", inode, name,
- ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx name '%s' issued %s\n", inode,
+ ceph_vinop(inode), name, ceph_cap_string(issued));
__build_xattrs(inode);
if (!ci->i_xattrs.prealloc_blob ||
@@ -1246,7 +1262,8 @@ retry:
spin_unlock(&ci->i_ceph_lock);
ceph_buffer_put(old_blob); /* Shouldn't be required */
- dout(" pre-allocating new blob size=%d\n", required_blob_size);
+ doutc(cl, " pre-allocating new blob size=%d\n",
+ required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto do_sync_unlocked;
@@ -1285,8 +1302,9 @@ do_sync_unlocked:
/* security module set xattr while filling trace */
if (current->journal_info) {
- pr_warn_ratelimited("sync setxattr %p "
- "during filling trace\n", inode);
+ pr_warn_ratelimited_client(cl,
+ "sync %p %llx.%llx during filling trace\n",
+ inode, ceph_vinop(inode));
err = -EBUSY;
} else {
err = ceph_sync_setxattr(inode, name, value, size, flags);
@@ -1446,7 +1464,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
* List of handlers for synthetic system.* attributes. Other
* attributes are handled directly.
*/
-const struct xattr_handler *ceph_xattr_handlers[] = {
+const struct xattr_handler * const ceph_xattr_handlers[] = {
&ceph_other_xattr_handler,
NULL,
};
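
The ceph hunks above all follow one conversion: each debug call now needs a struct ceph_client pointer, obtained via ceph_inode_to_client(), and most messages gain the inode's vino via ceph_vinop(). A minimal sketch of the pattern, assuming the doutc() and pr_warn_ratelimited_client() helpers introduced earlier in this series (the function below is hypothetical, not part of the patch):

static void example_xattr_debug(struct inode *inode)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);

	/* old: dout("getxattr %p ...\n", inode, ...); */
	/* new: message is tagged with the client and carries the vino */
	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
}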
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 950b6919fb87..57cc096c498a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -25,7 +25,7 @@
#include "internal.h"
-static struct kobj_map *cdev_map;
+static struct kobj_map *cdev_map __ro_after_init;
static DEFINE_MUTEX(chrdevs_lock);
@@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p)
struct module *owner = p->owner;
struct kobject *kobj;
- if (owner && !try_module_get(owner))
+ if (!try_module_get(owner))
return NULL;
kobj = kobject_get_unless_zero(&p->kobj);
if (!kobj)
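
The cdev_get() change relies on try_module_get() accepting a NULL module: built-in drivers have no owner, and try_module_get(NULL) succeeds, so the explicit owner check was redundant. A minimal sketch of that equivalence (hypothetical helper):

static bool example_cdev_owner_ref(struct module *owner)
{
	/* returns true immediately when owner == NULL (built-in code),
	 * otherwise takes a reference on the owning module */
	return try_module_get(owner);
}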
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index ae023853a98f..1d2dac95f86a 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -123,9 +123,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_size != -1)
inode->i_blocks = (attr->va_size + 511) >> 9;
if (attr->va_atime.tv_sec != -1)
- inode->i_atime = coda_to_timespec64(attr->va_atime);
+ inode_set_atime_to_ts(inode,
+ coda_to_timespec64(attr->va_atime));
if (attr->va_mtime.tv_sec != -1)
- inode->i_mtime = coda_to_timespec64(attr->va_mtime);
+ inode_set_mtime_to_ts(inode,
+ coda_to_timespec64(attr->va_mtime));
if (attr->va_ctime.tv_sec != -1)
inode_set_ctime_to_ts(inode,
coda_to_timespec64(attr->va_ctime));
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index cb512b10473b..4e552ba7bd43 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir)
/* optimistically we can also act as if our nose bleeds. The
* granularity of the mtime is coarse anyways so we might actually be
* right most of the time. Note: we only do this for directories. */
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
#endif
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 42346618b4ed..16acc58311ea 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
- coda_inode->i_mtime = inode_set_ctime_current(coda_inode);
+ inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode));
inode_unlock(coda_inode);
file_end_write(host_file);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index fbdcb3582926..dcc22f593e43 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -88,7 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -96,8 +96,8 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_mode = iattr->ia_mode;
inode->i_uid = iattr->ia_uid;
inode->i_gid = iattr->ia_gid;
- inode->i_atime = iattr->ia_atime;
- inode->i_mtime = iattr->ia_mtime;
+ inode_set_atime_to_ts(inode, iattr->ia_atime);
+ inode_set_mtime_to_ts(inode, iattr->ia_mtime);
inode_set_ctime_to_ts(inode, iattr->ia_ctime);
}
@@ -171,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode)
return ERR_PTR(-ENOMEM);
p_inode = d_inode(dentry->d_parent);
- p_inode->i_mtime = inode_set_ctime_current(p_inode);
+ inode_set_mtime_to_ts(p_inode, inode_set_ctime_current(p_inode));
configfs_set_inode_lock_class(sd, inode);
return inode;
}
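
The coda and configfs hunks above are part of the tree-wide switch from assigning inode->i_atime/i_mtime directly to using accessor helpers. The nested calls seen in several hunks work because each setter returns the timespec64 it stored. A minimal sketch of the pattern, under that assumption (hypothetical function name):

static void example_touch_dir(struct inode *dir)
{
	/* old: dir->i_mtime = inode_set_ctime_current(dir); */
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}

For freshly initialized inodes that want atime = mtime = ctime = now, simple_inode_init_ts() replaces the old chained assignment, as in the configfs set_default_inode_attr() hunk.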
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 5ee7d7bbb361..60dbfa0f8805 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -133,8 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
}
/* Struct copy intentional */
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- zerotime);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime)));
/* inode->i_nlink is left 1 - arguably wrong for directories,
but it's the best we can do without reading the directory
contents. 1 yields the right result in GNU find, even
@@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb)
sb->s_mtd = NULL;
} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- blkdev_put(sb->s_bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
kfree(sbi);
}
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 62e1a3dd8357..0ad8c30b8fa5 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -111,10 +111,14 @@ out:
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
- const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits;
- const unsigned int blocks_per_page = 1 << blocks_per_page_bits;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
+ const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
+ const unsigned int du_per_page = 1U << du_per_page_bits;
+ u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits);
+ u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits);
+ sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT);
struct page *pages[16]; /* write up to 16 pages at a time */
unsigned int nr_pages;
unsigned int i;
@@ -130,8 +134,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
len);
BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
- nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
- (len + blocks_per_page - 1) >> blocks_per_page_bits);
+ nr_pages = min_t(u64, ARRAY_SIZE(pages),
+ (du_remaining + du_per_page - 1) >> du_per_page_bits);
/*
* We need at least one page for ciphertext. Allocate the first one
@@ -154,21 +158,22 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
do {
- bio->bi_iter.bi_sector = pblk << (blockbits - 9);
+ bio->bi_iter.bi_sector = sector;
i = 0;
offset = 0;
do {
- err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk,
- ZERO_PAGE(0), pages[i],
- blocksize, offset, GFP_NOFS);
+ err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
+ ZERO_PAGE(0), pages[i],
+ du_size, offset,
+ GFP_NOFS);
if (err)
goto out;
- lblk++;
- pblk++;
- len--;
- offset += blocksize;
- if (offset == PAGE_SIZE || len == 0) {
+ du_index++;
+ sector += 1U << (du_bits - SECTOR_SHIFT);
+ du_remaining--;
+ offset += du_size;
+ if (offset == PAGE_SIZE || du_remaining == 0) {
ret = bio_add_page(bio, pages[i++], offset, 0);
if (WARN_ON_ONCE(ret != offset)) {
err = -EIO;
@@ -176,13 +181,13 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
}
offset = 0;
}
- } while (i != nr_pages && len != 0);
+ } while (i != nr_pages && du_remaining != 0);
err = submit_bio_wait(bio);
if (err)
goto out;
bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
- } while (len != 0);
+ } while (du_remaining != 0);
err = 0;
out:
bio_put(bio);
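
The fscrypt_zeroout_range() rewrite above converts the block-based loop into a data-unit-based one. The shift amounts follow directly from the new fields: with 4096-byte filesystem blocks (i_blkbits = 12) and, say, 1024-byte data units (du_bits = 10), each block covers four data units, and each data unit advances the sector by 1U << (du_bits - SECTOR_SHIFT) = 2 sectors. A minimal sketch of that geometry, assuming those example sizes (hypothetical helper):

static void example_zeroout_geometry(const struct inode *inode, pgoff_t lblk,
				     sector_t pblk, unsigned int len,
				     unsigned int du_bits, u64 *du_index,
				     u64 *du_count, sector_t *sector)
{
	/* e.g. i_blkbits = 12 (4096-byte blocks), du_bits = 10 (1024-byte DUs) */
	*du_index = (u64)lblk << (inode->i_blkbits - du_bits);	/* lblk * 4 */
	*du_count = (u64)len << (inode->i_blkbits - du_bits);	/* len * 4 */
	*sector = pblk << (inode->i_blkbits - SECTOR_SHIFT);	/* pblk * 8 */
	/* each data unit then advances *sector by 1U << (du_bits - SECTOR_SHIFT) */
}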
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6a837e4b80dc..328470d40dec 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
static struct workqueue_struct *fscrypt_read_workqueue;
static DEFINE_MUTEX(fscrypt_init_mutex);
-struct kmem_cache *fscrypt_info_cachep;
+struct kmem_cache *fscrypt_inode_info_cachep;
void fscrypt_enqueue_decrypt_work(struct work_struct *work)
{
@@ -49,6 +49,13 @@ EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
{
+ if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) {
+ /*
+ * Oops, the filesystem called a function that uses the bounce
+ * page pool, but it didn't set needs_bounce_pages.
+ */
+ return NULL;
+ }
return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
}
@@ -70,44 +77,44 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
EXPORT_SYMBOL(fscrypt_free_bounce_page);
/*
- * Generate the IV for the given logical block number within the given file.
- * For filenames encryption, lblk_num == 0.
+ * Generate the IV for the given data unit index within the given file.
+ * For filenames encryption, index == 0.
*
* Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks()
* needs to know about any IV generation methods where the low bits of IV don't
- * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ * simply contain the data unit index (e.g., IV_INO_LBLK_32).
*/
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
- const struct fscrypt_info *ci)
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
+ const struct fscrypt_inode_info *ci)
{
u8 flags = fscrypt_policy_flags(&ci->ci_policy);
memset(iv, 0, ci->ci_mode->ivsize);
if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
- WARN_ON_ONCE(lblk_num > U32_MAX);
+ WARN_ON_ONCE(index > U32_MAX);
WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX);
- lblk_num |= (u64)ci->ci_inode->i_ino << 32;
+ index |= (u64)ci->ci_inode->i_ino << 32;
} else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
- WARN_ON_ONCE(lblk_num > U32_MAX);
- lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
+ WARN_ON_ONCE(index > U32_MAX);
+ index = (u32)(ci->ci_hashed_ino + index);
} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE);
}
- iv->lblk_num = cpu_to_le64(lblk_num);
+ iv->index = cpu_to_le64(index);
}
-/* Encrypt or decrypt a single filesystem block of file contents */
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
- u64 lblk_num, struct page *src_page,
- struct page *dest_page, unsigned int len,
- unsigned int offs, gfp_t gfp_flags)
+/* Encrypt or decrypt a single "data unit" of file contents. */
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+ fscrypt_direction_t rw, u64 index,
+ struct page *src_page, struct page *dest_page,
+ unsigned int len, unsigned int offs,
+ gfp_t gfp_flags)
{
union fscrypt_iv iv;
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
- struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
int res = 0;
@@ -116,7 +123,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0))
return -EINVAL;
- fscrypt_generate_iv(&iv, lblk_num, ci);
+ fscrypt_generate_iv(&iv, index, ci);
req = skcipher_request_alloc(tfm, gfp_flags);
if (!req)
@@ -137,28 +144,29 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res) {
- fscrypt_err(inode, "%scryption failed for block %llu: %d",
- (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res);
+ fscrypt_err(ci->ci_inode,
+ "%scryption failed for data unit %llu: %d",
+ (rw == FS_DECRYPT ? "De" : "En"), index, res);
return res;
}
return 0;
}
/**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a
- * pagecache page
- * @page: The locked pagecache page containing the block(s) to encrypt
- * @len: Total size of the block(s) to encrypt. Must be a nonzero
- * multiple of the filesystem's block size.
- * @offs: Byte offset within @page of the first block to encrypt. Must be
- * a multiple of the filesystem's block size.
- * @gfp_flags: Memory allocation flags. See details below.
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page
+ * @page: the locked pagecache page containing the data to encrypt
+ * @len: size of the data to encrypt, in bytes
+ * @offs: offset within @page of the data to encrypt, in bytes
+ * @gfp_flags: memory allocation flags; see details below
+ *
+ * This allocates a new bounce page and encrypts the given data into it. The
+ * length and offset of the data must be aligned to the file's crypto data unit
+ * size. Alignment to the filesystem block size fulfills this requirement, as
+ * the filesystem block size is always a multiple of the data unit size.
*
- * A new bounce page is allocated, and the specified block(s) are encrypted into
- * it. In the bounce page, the ciphertext block(s) will be located at the same
- * offsets at which the plaintext block(s) were located in the source page; any
- * other parts of the bounce page will be left uninitialized. However, normally
- * blocksize == PAGE_SIZE and the whole page is encrypted at once.
+ * In the bounce page, the ciphertext data will be located at the same offset at
+ * which the plaintext data was located in the source page. Any other parts of
+ * the bounce page will be left uninitialized.
*
* This is for use by the filesystem's ->writepages() method.
*
@@ -176,28 +184,29 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
{
const struct inode *inode = page->mapping->host;
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
- u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
- (offs >> blockbits);
+ u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
+ (offs >> du_bits);
unsigned int i;
int err;
if (WARN_ON_ONCE(!PageLocked(page)))
return ERR_PTR(-EINVAL);
- if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+ if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
return ERR_PTR(-EINVAL);
ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
if (!ciphertext_page)
return ERR_PTR(-ENOMEM);
- for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
- err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num,
- page, ciphertext_page,
- blocksize, i, gfp_flags);
+ for (i = offs; i < offs + len; i += du_size, index++) {
+ err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
+ page, ciphertext_page,
+ du_size, i, gfp_flags);
if (err) {
fscrypt_free_bounce_page(ciphertext_page);
return ERR_PTR(err);
@@ -224,30 +233,33 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
* arbitrary page, not necessarily in the original pagecache page. The @inode
* and @lblk_num must be specified, as they can't be determined from @page.
*
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
* Return: 0 on success; -errno on failure
*/
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
u64 lblk_num, gfp_t gfp_flags)
{
- return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page,
- len, offs, gfp_flags);
+ if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+ return -EOPNOTSUPP;
+ return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
+ lblk_num, page, page, len, offs,
+ gfp_flags);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
/**
- * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a
- * pagecache folio
- * @folio: The locked pagecache folio containing the block(s) to decrypt
- * @len: Total size of the block(s) to decrypt. Must be a nonzero
- * multiple of the filesystem's block size.
- * @offs: Byte offset within @folio of the first block to decrypt. Must be
- * a multiple of the filesystem's block size.
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio
+ * @folio: the pagecache folio containing the data to decrypt
+ * @len: size of the data to decrypt, in bytes
+ * @offs: offset within @folio of the data to decrypt, in bytes
*
- * The specified block(s) are decrypted in-place within the pagecache folio,
- * which must still be locked and not uptodate.
- *
- * This is for use by the filesystem's ->readahead() method.
+ * Decrypt data that has just been read from an encrypted file. The data must
+ * be located in a pagecache folio that is still locked and not yet uptodate.
+ * The length and offset of the data must be aligned to the file's crypto data
+ * unit size. Alignment to the filesystem block size fulfills this requirement,
+ * as the filesystem block size is always a multiple of the data unit size.
*
* Return: 0 on success; -errno on failure
*/
@@ -255,25 +267,26 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
size_t offs)
{
const struct inode *inode = folio->mapping->host;
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
- u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) +
- (offs >> blockbits);
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
+ u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
+ (offs >> du_bits);
size_t i;
int err;
if (WARN_ON_ONCE(!folio_test_locked(folio)))
return -EINVAL;
- if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+ if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
return -EINVAL;
- for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+ for (i = offs; i < offs + len; i += du_size, index++) {
struct page *page = folio_page(folio, i >> PAGE_SHIFT);
- err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
- page, blocksize, i & ~PAGE_MASK,
- GFP_NOFS);
+ err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
+ page, du_size, i & ~PAGE_MASK,
+ GFP_NOFS);
if (err)
return err;
}
@@ -295,14 +308,19 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
* arbitrary page, not necessarily in the original pagecache page. The @inode
* and @lblk_num must be specified, as they can't be determined from @page.
*
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
* Return: 0 on success; -errno on failure
*/
int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
u64 lblk_num)
{
- return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
- len, offs, GFP_NOFS);
+ if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+ return -EOPNOTSUPP;
+ return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
+ lblk_num, page, page, len, offs,
+ GFP_NOFS);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
@@ -325,7 +343,7 @@ int fscrypt_initialize(struct super_block *sb)
return 0;
/* No need to allocate a bounce page pool if this FS won't use it. */
- if (sb->s_cop->flags & FS_CFLG_OWN_PAGES)
+ if (!sb->s_cop->needs_bounce_pages)
return 0;
mutex_lock(&fscrypt_init_mutex);
@@ -391,18 +409,19 @@ static int __init fscrypt_init(void)
if (!fscrypt_read_workqueue)
goto fail;
- fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
- if (!fscrypt_info_cachep)
+ fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_inode_info_cachep)
goto fail_free_queue;
err = fscrypt_init_keyring();
if (err)
- goto fail_free_info;
+ goto fail_free_inode_info;
return 0;
-fail_free_info:
- kmem_cache_destroy(fscrypt_info_cachep);
+fail_free_inode_info:
+ kmem_cache_destroy(fscrypt_inode_info_cachep);
fail_free_queue:
destroy_workqueue(fscrypt_read_workqueue);
fail:
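
The crypto.c hunks above change the low-level primitive from fscrypt_crypt_block(), keyed by inode and logical block number, to fscrypt_crypt_data_unit(), keyed by the fscrypt_inode_info and a data unit index. A minimal sketch of how a pagecache-based caller derives that index, mirroring how the updated fscrypt_encrypt_pagecache_blocks() computes it (the real function encrypts into a bounce page and these are fscrypt-private interfaces; this in-place loop is only an illustration):

static int example_crypt_page_units(struct page *page, unsigned int len,
				    unsigned int offs, gfp_t gfp_flags)
{
	const struct inode *inode = page->mapping->host;
	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
	const unsigned int du_bits = ci->ci_data_unit_bits;
	const unsigned int du_size = 1U << du_bits;
	/* index of the first data unit covered by [offs, offs + len) */
	u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
		    (offs >> du_bits);
	unsigned int i;
	int err;

	for (i = offs; i < offs + len; i += du_size, index++) {
		err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, page,
					      page, du_size, i, gfp_flags);
		if (err)
			return err;
	}
	return 0;
}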
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6eae3f12ad50..7b3fc189593a 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
{
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
- const struct fscrypt_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
struct scatterlist sg;
@@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
- const struct fscrypt_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
int res;
@@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
*/
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
- const struct fscrypt_info *ci = dir->i_crypt_info;
+ const struct fscrypt_inode_info *ci = dir->i_crypt_info;
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 2d63da48635a..1892356cf924 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -47,7 +47,8 @@ struct fscrypt_context_v2 {
u8 contents_encryption_mode;
u8 filenames_encryption_mode;
u8 flags;
- u8 __reserved[4];
+ u8 log2_data_unit_size;
+ u8 __reserved[3];
u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
@@ -165,6 +166,26 @@ fscrypt_policy_flags(const union fscrypt_policy *policy)
BUG();
}
+static inline int
+fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy,
+ const struct inode *inode)
+{
+ return policy->log2_data_unit_size ?: inode->i_blkbits;
+}
+
+static inline int
+fscrypt_policy_du_bits(const union fscrypt_policy *policy,
+ const struct inode *inode)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ return inode->i_blkbits;
+ case FSCRYPT_POLICY_V2:
+ return fscrypt_policy_v2_du_bits(&policy->v2, inode);
+ }
+ BUG();
+}
+
/*
* For encrypted symlinks, the ciphertext length is stored at the beginning
* of the string in little-endian format.
@@ -189,18 +210,18 @@ struct fscrypt_prepared_key {
};
/*
- * fscrypt_info - the "encryption key" for an inode
+ * fscrypt_inode_info - the "encryption key" for an inode
*
* When an encrypted file's key is made available, an instance of this struct is
* allocated and stored in ->i_crypt_info. Once created, it remains until the
* inode is evicted.
*/
-struct fscrypt_info {
+struct fscrypt_inode_info {
/* The key in a form prepared for actual encryption/decryption */
struct fscrypt_prepared_key ci_enc_key;
- /* True if ci_enc_key should be freed when this fscrypt_info is freed */
+ /* True if ci_enc_key should be freed when this struct is freed */
bool ci_owns_key;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
@@ -212,6 +233,16 @@ struct fscrypt_info {
#endif
/*
+ * log2 of the data unit size (granularity of contents encryption) of
+ * this file. This is computable from ci_policy and ci_inode but is
+ * cached here for efficiency. Only used for regular files.
+ */
+ u8 ci_data_unit_bits;
+
+ /* Cached value: log2 of number of data units per FS block */
+ u8 ci_data_units_per_block_bits;
+
+ /*
* Encryption mode used for this inode. It corresponds to either the
* contents or filenames encryption mode, depending on the inode type.
*/
@@ -263,12 +294,13 @@ typedef enum {
} fscrypt_direction_t;
/* crypto.c */
-extern struct kmem_cache *fscrypt_info_cachep;
+extern struct kmem_cache *fscrypt_inode_info_cachep;
int fscrypt_initialize(struct super_block *sb);
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
- u64 lblk_num, struct page *src_page,
- struct page *dest_page, unsigned int len,
- unsigned int offs, gfp_t gfp_flags);
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+ fscrypt_direction_t rw, u64 index,
+ struct page *src_page, struct page *dest_page,
+ unsigned int len, unsigned int offs,
+ gfp_t gfp_flags);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
void __printf(3, 4) __cold
@@ -283,8 +315,8 @@ fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
union fscrypt_iv {
struct {
- /* logical block number within the file */
- __le64 lblk_num;
+ /* zero-based index of data unit within the file */
+ __le64 index;
/* per-file nonce; only set in DIRECT_KEY mode */
u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
@@ -293,8 +325,18 @@ union fscrypt_iv {
__le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
};
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
- const struct fscrypt_info *ci);
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
+ const struct fscrypt_inode_info *ci);
+
+/*
+ * Return the number of bits used by the maximum file data unit index that is
+ * possible on the given filesystem, using the given log2 data unit size.
+ */
+static inline int
+fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits)
+{
+ return fls64(sb->s_maxbytes - 1) - du_bits;
+}
/* fname.c */
bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
@@ -332,17 +374,17 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci);
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);
static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
return ci->ci_inlinecrypt;
}
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci);
+ const struct fscrypt_inode_info *ci);
void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key);
@@ -353,7 +395,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
*/
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
/*
* The two smp_load_acquire()'s here pair with the smp_store_release()'s
@@ -370,13 +412,13 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
-static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
{
return 0;
}
static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
return false;
}
@@ -384,7 +426,7 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
static inline int
fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
WARN_ON_ONCE(1);
return -EOPNOTSUPP;
@@ -398,7 +440,7 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb,
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
return smp_load_acquire(&prep_key->tfm) != NULL;
}
@@ -433,8 +475,28 @@ struct fscrypt_master_key_secret {
* fscrypt_master_key - an in-use master key
*
* This represents a master encryption key which has been added to the
- * filesystem and can be used to "unlock" the encrypted files which were
- * encrypted with it.
+ * filesystem. There are three high-level states that a key can be in:
+ *
+ * FSCRYPT_KEY_STATUS_PRESENT
+ * Key is fully usable; it can be used to unlock inodes that are encrypted
+ * with it (this includes being able to create new inodes). ->mk_present
+ * indicates whether the key is in this state. ->mk_secret exists, the key
+ * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present.
+ *
+ * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED
+ * Removal of this key has been initiated, but some inodes that were
+ * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped,
+ * and the key can no longer be used to unlock inodes. Unlike ABSENT, the
+ * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and
+ * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes.
+ *
+ * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty,
+ * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key.
+ *
+ * FSCRYPT_KEY_STATUS_ABSENT
+ * Key is fully removed. The key is no longer in the keyring,
+ * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is
+ * wiped, and the key can no longer be used to unlock inodes.
*/
struct fscrypt_master_key {
@@ -444,7 +506,7 @@ struct fscrypt_master_key {
*/
struct hlist_node mk_node;
- /* Semaphore that protects ->mk_secret and ->mk_users */
+ /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */
struct rw_semaphore mk_sem;
/*
@@ -454,8 +516,8 @@ struct fscrypt_master_key {
* ->mk_direct_keys) that have been prepared continue to exist.
* A structural ref only guarantees that the struct continues to exist.
*
- * There is one active ref associated with ->mk_secret being present,
- * and one active ref for each inode in ->mk_decrypted_inodes.
+ * There is one active ref associated with ->mk_present being true, and
+ * one active ref for each inode in ->mk_decrypted_inodes.
*
* There is one structural ref associated with the active refcount being
* nonzero. Finding a key in the keyring also takes a structural ref,
@@ -467,17 +529,10 @@ struct fscrypt_master_key {
struct rcu_head mk_rcu_head;
/*
- * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is
- * executed, this is wiped and no new inodes can be unlocked with this
- * key; however, there may still be inodes in ->mk_decrypted_inodes
- * which could not be evicted. As long as some inodes still remain,
- * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
- * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
+ * The secret key material. Wiped as soon as it is no longer needed;
+ * for details, see the fscrypt_master_key struct comment.
*
- * While ->mk_secret is present, one ref in ->mk_active_refs is held.
- *
- * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs
- * associated with this field is protected by ->mk_sem as well.
+ * Locking: protected by ->mk_sem.
*/
struct fscrypt_master_key_secret mk_secret;
@@ -500,7 +555,7 @@ struct fscrypt_master_key {
*
* Locking: protected by ->mk_sem. (We don't just rely on the keyrings
* subsystem semaphore ->mk_users->sem, as we need support for atomic
- * search+insert along with proper synchronization with ->mk_secret.)
+ * search+insert along with proper synchronization with other fields.)
*/
struct key *mk_users;
@@ -523,20 +578,17 @@ struct fscrypt_master_key {
siphash_key_t mk_ino_hash_key;
bool mk_ino_hash_key_initialized;
-} __randomize_layout;
-
-static inline bool
-is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
-{
/*
- * The READ_ONCE() is only necessary for fscrypt_drop_inode().
- * fscrypt_drop_inode() runs in atomic context, so it can't take the key
- * semaphore and thus 'secret' can change concurrently which would be a
- * data race. But fscrypt_drop_inode() only need to know whether the
- * secret *was* present at the time of check, so READ_ONCE() suffices.
+ * Whether this key is in the "present" state, i.e. fully usable. For
+ * details, see the fscrypt_master_key struct comment.
+ *
+ * Locking: protected by ->mk_sem, but can be read locklessly using
+ * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers
+ * are possible.
*/
- return READ_ONCE(secret->size) != 0;
-}
+ bool mk_present;
+
+} __randomize_layout;
static inline const char *master_key_spec_type(
const struct fscrypt_key_specifier *spec)
@@ -598,17 +650,18 @@ struct fscrypt_mode {
extern struct fscrypt_mode fscrypt_modes[];
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key, const struct fscrypt_info *ci);
+ const u8 *raw_key, const struct fscrypt_inode_info *ci);
void fscrypt_destroy_prepared_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key);
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_key);
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported);
@@ -643,10 +696,11 @@ static inline int fscrypt_require_key(struct inode *inode)
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
const u8 *raw_master_key);
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci);
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+ struct fscrypt_inode_info *ci);
/* policy.c */
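
The fscrypt_private.h hunks above show where the data unit size comes from: the v2 policy (and on-disk context) gains a log2_data_unit_size byte, with 0 meaning "one data unit per filesystem block", and the result is cached in the new ci_data_unit_bits and ci_data_units_per_block_bits fields. A worked sketch of the derivation, assuming 4096-byte filesystem blocks (hypothetical helper):

static void example_du_bits(const struct fscrypt_policy_v2 *policy,
			    const struct inode *inode,
			    u8 *du_bits, u8 *du_per_block_bits)
{
	/* log2_data_unit_size == 0  -> du_bits = i_blkbits = 12 (4096-byte DUs)
	 * log2_data_unit_size == 10 -> du_bits = 10          (1024-byte DUs) */
	*du_bits = policy->log2_data_unit_size ?: inode->i_blkbits;

	/* cached shift: data units per filesystem block (0 or 2 above) */
	*du_per_block_bits = inode->i_blkbits - *du_bits;
}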
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 6238dbcadcad..52504dd478d3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr);
int fscrypt_prepare_setflags(struct inode *inode,
unsigned int oldflags, unsigned int flags)
{
- struct fscrypt_info *ci;
+ struct fscrypt_inode_info *ci;
struct fscrypt_master_key *mk;
int err;
@@ -187,7 +187,7 @@ int fscrypt_prepare_setflags(struct inode *inode,
return -EINVAL;
mk = ci->ci_master_key;
down_read(&mk->mk_sem);
- if (is_master_key_secret_present(&mk->mk_secret))
+ if (mk->mk_present)
err = fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 8bfb3ce86476..b4002aea7cdb 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -39,11 +39,11 @@ static struct block_device **fscrypt_get_devices(struct super_block *sb,
return devs;
}
-static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
+static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_inode_info *ci)
{
- struct super_block *sb = ci->ci_inode->i_sb;
+ const struct super_block *sb = ci->ci_inode->i_sb;
unsigned int flags = fscrypt_policy_flags(&ci->ci_policy);
- int ino_bits = 64, lblk_bits = 64;
+ int dun_bits;
if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
return offsetofend(union fscrypt_iv, nonce);
@@ -54,10 +54,9 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
return sizeof(__le32);
- /* Default case: IVs are just the file logical block number */
- if (sb->s_cop->get_ino_and_lblk_bits)
- sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
- return DIV_ROUND_UP(lblk_bits, 8);
+ /* Default case: IVs are just the file data unit index */
+ dun_bits = fscrypt_max_file_dun_bits(sb, ci->ci_data_unit_bits);
+ return DIV_ROUND_UP(dun_bits, 8);
}
/*
@@ -90,7 +89,7 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
}
/* Enable inline encryption for this file if supported. */
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
@@ -129,7 +128,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
* crypto configuration that the file would use.
*/
crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
- crypto_cfg.data_unit_size = sb->s_blocksize;
+ crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
devs = fscrypt_get_devices(sb, &num_devs);
@@ -152,7 +151,7 @@ out_free_devs:
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
@@ -168,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
return -ENOMEM;
err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
- fscrypt_get_dun_bytes(ci), sb->s_blocksize);
+ fscrypt_get_dun_bytes(ci),
+ 1U << ci->ci_data_unit_bits);
if (err) {
fscrypt_err(inode, "error %d initializing blk-crypto key", err);
goto fail;
@@ -232,13 +232,15 @@ bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
}
EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
-static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
+static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci,
+ u64 lblk_num,
u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
{
+ u64 index = lblk_num << ci->ci_data_units_per_block_bits;
union fscrypt_iv iv;
int i;
- fscrypt_generate_iv(&iv, lblk_num, ci);
+ fscrypt_generate_iv(&iv, index, ci);
BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE);
memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE);
@@ -265,7 +267,7 @@ static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
u64 first_lblk, gfp_t gfp_mask)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
if (!fscrypt_inode_uses_inline_crypto(inode))
@@ -456,7 +458,7 @@ EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
*/
u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
u32 dun;
if (!fscrypt_inode_uses_inline_crypto(inode))
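
For inline encryption the same data unit size now feeds blk-crypto: the DUN is the data unit index rather than the logical block number, so fscrypt_generate_dun() scales lblk_num by the cached data-units-per-block shift. With 4096-byte blocks and 1024-byte data units that shift is 2, so logical block 5 starts at DUN 20. A minimal sketch of the scaling, under those example sizes (hypothetical helper):

static u64 example_lblk_to_dun_index(const struct fscrypt_inode_info *ci,
				     u64 lblk_num)
{
	/* e.g. ci_data_units_per_block_bits == 2: block 5 -> index 20 */
	return lblk_num << ci->ci_data_units_per_block_bits;
}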
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 7cbb1fd872ac..f34a9b0b9e92 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -99,10 +99,10 @@ void fscrypt_put_master_key_activeref(struct super_block *sb,
spin_unlock(&sb->s_master_keys->lock);
/*
- * ->mk_active_refs == 0 implies that ->mk_secret is not present and
- * that ->mk_decrypted_inodes is empty.
+ * ->mk_active_refs == 0 implies that ->mk_present is false and
+ * ->mk_decrypted_inodes is empty.
*/
- WARN_ON_ONCE(is_master_key_secret_present(&mk->mk_secret));
+ WARN_ON_ONCE(mk->mk_present);
WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes));
for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
@@ -121,6 +121,18 @@ void fscrypt_put_master_key_activeref(struct super_block *sb,
fscrypt_put_master_key(mk);
}
+/*
+ * This transitions the key state from present to incompletely removed, and then
+ * potentially to absent (depending on whether inodes remain).
+ */
+static void fscrypt_initiate_key_removal(struct super_block *sb,
+ struct fscrypt_master_key *mk)
+{
+ WRITE_ONCE(mk->mk_present, false);
+ wipe_master_key_secret(&mk->mk_secret);
+ fscrypt_put_master_key_activeref(sb, mk);
+}
+
static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
{
if (spec->__reserved)
@@ -234,14 +246,13 @@ void fscrypt_destroy_keyring(struct super_block *sb)
* evicted, every key remaining in the keyring should
* have an empty inode list, and should only still be in
* the keyring due to the single active ref associated
- * with ->mk_secret. There should be no structural refs
- * beyond the one associated with the active ref.
+ * with ->mk_present. There should be no structural
+ * refs beyond the one associated with the active ref.
*/
WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1);
WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1);
- WARN_ON_ONCE(!is_master_key_secret_present(&mk->mk_secret));
- wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(sb, mk);
+ WARN_ON_ONCE(!mk->mk_present);
+ fscrypt_initiate_key_removal(sb, mk);
}
}
kfree_sensitive(keyring);
@@ -439,7 +450,8 @@ static int add_new_master_key(struct super_block *sb,
}
move_master_key_secret(&mk->mk_secret, secret);
- refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */
+ mk->mk_present = true;
+ refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */
spin_lock(&keyring->lock);
hlist_add_head_rcu(&mk->mk_node,
@@ -478,11 +490,18 @@ static int add_existing_master_key(struct fscrypt_master_key *mk,
return err;
}
- /* Re-add the secret if needed. */
- if (!is_master_key_secret_present(&mk->mk_secret)) {
- if (!refcount_inc_not_zero(&mk->mk_active_refs))
+ /* If the key is incompletely removed, make it present again. */
+ if (!mk->mk_present) {
+ if (!refcount_inc_not_zero(&mk->mk_active_refs)) {
+ /*
+ * Raced with the last active ref being dropped, so the
+ * key has become, or is about to become, "absent".
+ * Therefore, we need to allocate a new key struct.
+ */
return KEY_DEAD;
+ }
move_master_key_secret(&mk->mk_secret, secret);
+ WRITE_ONCE(mk->mk_present, true);
}
return 0;
@@ -506,8 +525,8 @@ static int do_add_master_key(struct super_block *sb,
err = add_new_master_key(sb, secret, mk_spec);
} else {
/*
- * Found the key in ->s_master_keys. Re-add the secret if
- * needed, and add the user to ->mk_users if needed.
+ * Found the key in ->s_master_keys. Add the user to ->mk_users
+ * if needed, and make the key "present" again if possible.
*/
down_write(&mk->mk_sem);
err = add_existing_master_key(mk, secret);
@@ -867,7 +886,7 @@ static void shrink_dcache_inode(struct inode *inode)
static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
{
- struct fscrypt_info *ci;
+ struct fscrypt_inode_info *ci;
struct inode *inode;
struct inode *toput_inode = NULL;
@@ -917,7 +936,7 @@ static int check_for_busy_inodes(struct super_block *sb,
/* select an example file to show for debugging purposes */
struct inode *inode =
list_first_entry(&mk->mk_decrypted_inodes,
- struct fscrypt_info,
+ struct fscrypt_inode_info,
ci_master_key_link)->ci_inode;
ino = inode->i_ino;
}
@@ -989,9 +1008,8 @@ static int try_to_lock_encrypted_files(struct super_block *sb,
*
* If all inodes were evicted, then we unlink the fscrypt_master_key from the
* keyring. Otherwise it remains in the keyring in the "incompletely removed"
- * state (without the actual secret key) where it tracks the list of remaining
- * inodes. Userspace can execute the ioctl again later to retry eviction, or
- * alternatively can re-add the secret key again.
+ * state where it tracks the list of remaining inodes. Userspace can execute
+ * the ioctl again later to retry eviction, or alternatively can re-add the key.
*
* For more details, see the "Removing keys" section of
* Documentation/filesystems/fscrypt.rst.
@@ -1053,11 +1071,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
}
}
- /* No user claims remaining. Go ahead and wipe the secret. */
+ /* No user claims remaining. Initiate removal of the key. */
err = -ENOKEY;
- if (is_master_key_secret_present(&mk->mk_secret)) {
- wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(sb, mk);
+ if (mk->mk_present) {
+ fscrypt_initiate_key_removal(sb, mk);
err = 0;
}
inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
@@ -1074,9 +1091,9 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
}
/*
* We return 0 if we successfully did something: removed a claim to the
- * key, wiped the secret, or tried locking the files again. Users need
- * to check the informational status flags if they care whether the key
- * has been fully removed including all files locked.
+ * key, initiated removal of the key, or tried locking the files again.
+ * Users need to check the informational status flags if they care
+ * whether the key has been fully removed including all files locked.
*/
out_put_key:
fscrypt_put_master_key(mk);
@@ -1103,12 +1120,11 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users);
* Retrieve the status of an fscrypt master encryption key.
*
* We set ->status to indicate whether the key is absent, present, or
- * incompletely removed. "Incompletely removed" means that the master key
- * secret has been removed, but some files which had been unlocked with it are
- * still in use. This field allows applications to easily determine the state
- * of an encrypted directory without using a hack such as trying to open a
- * regular file in it (which can confuse the "incompletely removed" state with
- * absent or present).
+ * incompletely removed. (For an explanation of what these statuses mean and
+ * how they are represented internally, see struct fscrypt_master_key.) This
+ * field allows applications to easily determine the status of an encrypted
+ * directory without using a hack such as trying to open a regular file in it
+ * (which can confuse the "incompletely removed" status with absent or present).
*
* In addition, for v2 policy keys we allow applications to determine, via
* ->status_flags and ->user_count, whether the key has been added by the
@@ -1150,7 +1166,7 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
}
down_read(&mk->mk_sem);
- if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!mk->mk_present) {
arg.status = refcount_read(&mk->mk_active_refs) > 0 ?
FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
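
The keyring.c hunks above replace checks on the key secret with the new mk_present flag and centralize removal in fscrypt_initiate_key_removal(). The three key states then map onto the struct fields as in the status logic of the final hunk; a sketch of that mapping (the helper name is hypothetical):

static int example_key_status(const struct fscrypt_master_key *mk)
{
	if (READ_ONCE(mk->mk_present))
		return FSCRYPT_KEY_STATUS_PRESENT;
	if (refcount_read(&mk->mk_active_refs) > 0)
		/* secret wiped, but unlocked inodes still reference the key */
		return FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED;
	return FSCRYPT_KEY_STATUS_ABSENT;
}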
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 361f41ef46c7..d71f7c799e79 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -148,7 +148,7 @@ err_free_tfm:
* and IV generation method (@ci->ci_policy.flags).
*/
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key, const struct fscrypt_info *ci)
+ const u8 *raw_key, const struct fscrypt_inode_info *ci)
{
struct crypto_skcipher *tfm;
@@ -178,13 +178,14 @@ void fscrypt_destroy_prepared_key(struct super_block *sb,
}
/* Given a per-file encryption key, set up the file's crypto transform object */
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_key)
{
ci->ci_owns_key = true;
return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci);
}
-static int setup_per_mode_enc_key(struct fscrypt_info *ci,
+static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk,
struct fscrypt_prepared_key *keys,
u8 hkdf_context, bool include_fs_uuid)
@@ -265,7 +266,7 @@ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
return 0;
}
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk)
{
int err;
@@ -279,7 +280,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
return 0;
}
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk)
{
WARN_ON_ONCE(ci->ci_inode->i_ino == 0);
@@ -289,7 +290,7 @@ void fscrypt_hash_inode_number(struct fscrypt_info *ci,
&mk->mk_ino_hash_key);
}
-static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
+static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk)
{
int err;
@@ -329,7 +330,7 @@ unlock:
return 0;
}
-static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
+static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk,
bool need_dirhash_key)
{
@@ -404,7 +405,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* still allow 512-bit master keys if the user chooses to use them, though.)
*/
static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
unsigned int min_keysize;
@@ -430,11 +431,12 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
*
* If the master key is found in the filesystem-level keyring, then it is
* returned in *mk_ret with its semaphore read-locked. This is needed to ensure
- * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
- * multiple tasks may race to create an fscrypt_info for the same inode), and to
- * synchronize the master key being removed with a new inode starting to use it.
+ * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes
+ * (as multiple tasks may race to create an fscrypt_inode_info for the same
+ * inode), and to synchronize the master key being removed with a new inode
+ * starting to use it.
*/
-static int setup_file_encryption_key(struct fscrypt_info *ci,
+static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
bool need_dirhash_key,
struct fscrypt_master_key **mk_ret)
{
@@ -484,8 +486,8 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
}
down_read(&mk->mk_sem);
- /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
- if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!mk->mk_present) {
+ /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */
err = -ENOKEY;
goto out_release_key;
}
@@ -519,7 +521,7 @@ out_release_key:
return err;
}
-static void put_crypt_info(struct fscrypt_info *ci)
+static void put_crypt_info(struct fscrypt_inode_info *ci)
{
struct fscrypt_master_key *mk;
@@ -537,8 +539,8 @@ static void put_crypt_info(struct fscrypt_info *ci)
/*
* Remove this inode from the list of inodes that were unlocked
* with the master key. In addition, if we're removing the last
- * inode from a master key struct that already had its secret
- * removed, then complete the full removal of the struct.
+ * inode from an incompletely removed key, then complete the
+ * full removal of the key.
*/
spin_lock(&mk->mk_decrypted_inodes_lock);
list_del(&ci->ci_master_key_link);
@@ -546,7 +548,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
}
memzero_explicit(ci, sizeof(*ci));
- kmem_cache_free(fscrypt_info_cachep, ci);
+ kmem_cache_free(fscrypt_inode_info_cachep, ci);
}
static int
@@ -555,7 +557,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
bool need_dirhash_key)
{
- struct fscrypt_info *crypt_info;
+ struct fscrypt_inode_info *crypt_info;
struct fscrypt_mode *mode;
struct fscrypt_master_key *mk = NULL;
int res;
@@ -564,7 +566,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
if (res)
return res;
- crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
+ crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL);
if (!crypt_info)
return -ENOMEM;
@@ -580,6 +582,11 @@ fscrypt_setup_encryption_info(struct inode *inode,
WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
+ crypt_info->ci_data_unit_bits =
+ fscrypt_policy_du_bits(&crypt_info->ci_policy, inode);
+ crypt_info->ci_data_units_per_block_bits =
+ inode->i_blkbits - crypt_info->ci_data_unit_bits;
+
res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
if (res)
goto out;
@@ -587,8 +594,8 @@ fscrypt_setup_encryption_info(struct inode *inode,
/*
* For existing inodes, multiple tasks may race to set ->i_crypt_info.
* So use cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
- * RELEASE barrier so that other tasks can ACQUIRE it.
+ * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with
+ * a RELEASE barrier so that other tasks can ACQUIRE it.
*/
if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
/*
@@ -735,8 +742,8 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
* fscrypt_put_encryption_info() - free most of an inode's fscrypt data
* @inode: an inode being evicted
*
- * Free the inode's fscrypt_info. Filesystems must call this when the inode is
- * being evicted. An RCU grace period need not have elapsed yet.
+ * Free the inode's fscrypt_inode_info. Filesystems must call this when the
+ * inode is being evicted. An RCU grace period need not have elapsed yet.
*/
void fscrypt_put_encryption_info(struct inode *inode)
{
@@ -773,7 +780,7 @@ EXPORT_SYMBOL(fscrypt_free_inode);
*/
int fscrypt_drop_inode(struct inode *inode)
{
- const struct fscrypt_info *ci = fscrypt_get_info(inode);
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode);
/*
* If ci is NULL, then the inode doesn't have an encryption key set up
@@ -794,13 +801,14 @@ int fscrypt_drop_inode(struct inode *inode)
return 0;
/*
- * Note: since we aren't holding the key semaphore, the result here can
+ * We can't take ->mk_sem here, since this runs in atomic context.
+ * Therefore, ->mk_present can change concurrently, and our result may
* immediately become outdated. But there's no correctness problem with
* unnecessarily evicting. Nor is there a correctness problem with not
* evicting while iput() is racing with the key being removed, since
* then the thread removing the key will either evict the inode itself
* or will correctly detect that it wasn't evicted due to the race.
*/
- return !is_master_key_secret_present(&ci->ci_master_key->mk_secret);
+ return !READ_ONCE(ci->ci_master_key->mk_present);
}
EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
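
The keysetup.c comment above describes how ->i_crypt_info is published: the writer uses cmpxchg_release() and readers pair with it through an acquire load in fscrypt_get_inode_info(). A minimal sketch of the reader side, assuming fscrypt_get_inode_info() is simply the renamed acquire-load accessor referred to in that hunk:

static struct fscrypt_inode_info *
example_get_inode_info(const struct inode *inode)
{
	/* pairs with the cmpxchg_release() in fscrypt_setup_encryption_info():
	 * once the pointer is observed, the fully initialized struct is too */
	return smp_load_acquire(&inode->i_crypt_info);
}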
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 75dabd9b27f9..cf3b58ec32cc 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -20,8 +20,8 @@
* managed alongside the master keys in the filesystem-level keyring)
*/
-#include <crypto/algapi.h>
#include <crypto/skcipher.h>
+#include <crypto/utils.h>
#include <keys/user-type.h>
#include <linux/hashtable.h>
#include <linux/scatterlist.h>
@@ -178,7 +178,8 @@ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
*/
static struct fscrypt_direct_key *
find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
- const u8 *raw_key, const struct fscrypt_info *ci)
+ const u8 *raw_key,
+ const struct fscrypt_inode_info *ci)
{
unsigned long hash_key;
struct fscrypt_direct_key *dk;
@@ -218,7 +219,7 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
/* Prepare to encrypt directly using the master key in the given mode */
static struct fscrypt_direct_key *
-fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
+fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key)
{
struct fscrypt_direct_key *dk;
int err;
@@ -250,7 +251,7 @@ err_free_dk:
}
/* v1 policy, DIRECT_KEY: use the master key directly */
-static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci,
const u8 *raw_master_key)
{
struct fscrypt_direct_key *dk;
@@ -264,7 +265,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci,
}
/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
-static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci,
const u8 *raw_master_key)
{
u8 *derived_key;
@@ -289,7 +290,8 @@ out:
return err;
}
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_master_key)
{
if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
return setup_v1_file_key_direct(ci, raw_master_key);
@@ -297,8 +299,10 @@ int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
return setup_v1_file_key_derived(ci, raw_master_key);
}
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
+int
+fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci)
{
+ const struct super_block *sb = ci->ci_inode->i_sb;
struct key *key;
const struct fscrypt_key *payload;
int err;
@@ -306,8 +310,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
ci->ci_policy.v1.master_key_descriptor,
ci->ci_mode->keysize, &payload);
- if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
- key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+ if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) {
+ key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix,
ci->ci_policy.v1.master_key_descriptor,
ci->ci_mode->keysize, &payload);
}
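
The keysetup_v1 change above routes the legacy key-description prefix through the new s_cop->legacy_key_prefix hook instead of an open-coded field access. A minimal userspace sketch of the same lookup-with-fallback pattern; find_key() and the prefixes below are stand-ins, not kernel APIs:

#include <stdio.h>
#include <string.h>

/* pretend only the old fs-specific description exists in the keyring */
static const char *find_key(const char *desc)
{
	return strcmp(desc, "ext4:0123456789abcdef") == 0 ? "found" : NULL;
}

static const char *lookup(const char *descriptor, const char *legacy_prefix)
{
	char desc[64];
	const char *key;

	snprintf(desc, sizeof(desc), "fscrypt:%s", descriptor);
	key = find_key(desc);
	if (!key && legacy_prefix) {	/* fall back to the fs-specific prefix */
		snprintf(desc, sizeof(desc), "%s%s", legacy_prefix, descriptor);
		key = find_key(desc);
	}
	return key;
}

int main(void)
{
	printf("%s\n", lookup("0123456789abcdef", "ext4:") ? "found" : "missing");
	return 0;
}
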
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f4456ecb3f87..701259991277 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -118,12 +118,11 @@ static bool supported_direct_key_modes(const struct inode *inode,
}
static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
- const struct inode *inode,
- const char *type,
- int max_ino_bits, int max_lblk_bits)
+ const struct inode *inode)
{
+ const char *type = (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
+ ? "IV_INO_LBLK_64" : "IV_INO_LBLK_32";
struct super_block *sb = inode->i_sb;
- int ino_bits = 64, lblk_bits = 64;
/*
* IV_INO_LBLK_* exist only because of hardware limitations, and
@@ -150,17 +149,29 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
type, sb->s_id);
return false;
}
- if (sb->s_cop->get_ino_and_lblk_bits)
- sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
- if (ino_bits > max_ino_bits) {
+
+ /*
+ * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that inode numbers fit
+ * in 32 bits. In principle, IV_INO_LBLK_32 could support longer inode
+ * numbers because it hashes the inode number; however, currently the
+	 * inode number is taken from inode::i_ino, which is 'unsigned long'.
+ * So for now the implementation limit is 32 bits.
+ */
+ if (!sb->s_cop->has_32bit_inodes) {
fscrypt_warn(inode,
"Can't use %s policy on filesystem '%s' because its inode numbers are too long",
type, sb->s_id);
return false;
}
- if (lblk_bits > max_lblk_bits) {
+
+ /*
+ * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file data unit
+ * indices fit in 32 bits.
+ */
+ if (fscrypt_max_file_dun_bits(sb,
+ fscrypt_policy_v2_du_bits(policy, inode)) > 32) {
fscrypt_warn(inode,
- "Can't use %s policy on filesystem '%s' because its block numbers are too long",
+ "Can't use %s policy on filesystem '%s' because its maximum file size is too large",
type, sb->s_id);
return false;
}
@@ -233,25 +244,39 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
return false;
}
+ if (policy->log2_data_unit_size) {
+ if (!inode->i_sb->s_cop->supports_subblock_data_units) {
+ fscrypt_warn(inode,
+ "Filesystem does not support configuring crypto data unit size");
+ return false;
+ }
+ if (policy->log2_data_unit_size > inode->i_blkbits ||
+ policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */) {
+ fscrypt_warn(inode,
+ "Unsupported log2_data_unit_size in encryption policy: %d",
+ policy->log2_data_unit_size);
+ return false;
+ }
+ if (policy->log2_data_unit_size != inode->i_blkbits &&
+ (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+ /*
+ * Not safe to enable yet, as we need to ensure that DUN
+ * wraparound can only occur on a FS block boundary.
+ */
+ fscrypt_warn(inode,
+ "Sub-block data units not yet supported with IV_INO_LBLK_32");
+ return false;
+ }
+ }
+
if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
!supported_direct_key_modes(inode, policy->contents_encryption_mode,
policy->filenames_encryption_mode))
return false;
- if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
- !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64",
- 32, 32))
- return false;
-
- /*
- * IV_INO_LBLK_32 hashes the inode number, so in principle it can
- * support any ino_bits. However, currently the inode number is gotten
- * from inode::i_ino which is 'unsigned long'. So for now the
- * implementation limit is 32 bits.
- */
- if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
- !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
- 32, 32))
+ if ((policy->flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) &&
+ !supported_iv_ino_lblk_policy(policy, inode))
return false;
if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -330,6 +355,7 @@ static int fscrypt_new_context(union fscrypt_context *ctx_u,
ctx->filenames_encryption_mode =
policy->filenames_encryption_mode;
ctx->flags = policy->flags;
+ ctx->log2_data_unit_size = policy->log2_data_unit_size;
memcpy(ctx->master_key_identifier,
policy->master_key_identifier,
sizeof(ctx->master_key_identifier));
@@ -390,6 +416,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
policy->filenames_encryption_mode =
ctx->filenames_encryption_mode;
policy->flags = ctx->flags;
+ policy->log2_data_unit_size = ctx->log2_data_unit_size;
memcpy(policy->__reserved, ctx->__reserved,
sizeof(policy->__reserved));
memcpy(policy->master_key_identifier,
@@ -405,11 +432,11 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
/* Retrieve an inode's encryption policy */
static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
union fscrypt_context ctx;
int ret;
- ci = fscrypt_get_info(inode);
+ ci = fscrypt_get_inode_info(inode);
if (ci) {
/* key available, use the cached policy */
*policy = ci->ci_policy;
@@ -647,7 +674,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
/*
* Both parent and child are encrypted, so verify they use the same
- * encryption policy. Compare the fscrypt_info structs if the keys are
+ * encryption policy. Compare the cached policies if the keys are
* available, otherwise retrieve and compare the fscrypt_contexts.
*
* Note that the fscrypt_context retrieval will be required frequently
@@ -717,7 +744,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
*/
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
{
- struct fscrypt_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = inode->i_crypt_info;
BUILD_BUG_ON(sizeof(union fscrypt_context) !=
FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -742,7 +769,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
*/
int fscrypt_set_context(struct inode *inode, void *fs_data)
{
- struct fscrypt_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = inode->i_crypt_info;
union fscrypt_context ctx;
int ctxsize;
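
The policy.c hunks above accept a log2_data_unit_size between SECTOR_SHIFT and the filesystem block size, and refuse sub-block data units when IV_INO_LBLK_32 is set. A self-contained sketch of just those range checks (assuming a 4K block size in the example calls, not the kernel helper itself):

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

static bool du_size_ok(unsigned int log2_dus, unsigned int blkbits,
		       bool iv_ino_lblk_32)
{
	if (!log2_dus)			/* 0 = default: one data unit per block */
		return true;
	if (log2_dus < SECTOR_SHIFT || log2_dus > blkbits)
		return false;		/* must be 512 bytes .. FS block size */
	if (iv_ino_lblk_32 && log2_dus != blkbits)
		return false;		/* sub-block DUs not allowed here yet */
	return true;
}

int main(void)
{
	printf("%d %d %d\n",
	       du_size_ok(9, 12, false),	/* 1: 512-byte DUs on 4K blocks */
	       du_size_ok(8, 12, false),	/* 0: below the 512-byte floor */
	       du_size_ok(9, 12, true));	/* 0: IV_INO_LBLK_32 restriction */
	return 0;
}
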
diff --git a/fs/dax.c b/fs/dax.c
index 8fafecbe42b1..3380b43cb6bb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -412,23 +412,23 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
-/*
- * dax_lock_page - Lock the DAX entry corresponding to a page
- * @page: The page whose entry we want to lock
+/**
+ * dax_lock_folio - Lock the DAX entry corresponding to a folio
+ * @folio: The folio whose entry we want to lock
*
* Context: Process context.
- * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
+ * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
* not be locked.
*/
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock_folio(struct folio *folio)
{
XA_STATE(xas, NULL, 0);
void *entry;
- /* Ensure page->mapping isn't freed while we look at it */
+ /* Ensure folio->mapping isn't freed while we look at it */
rcu_read_lock();
for (;;) {
- struct address_space *mapping = READ_ONCE(page->mapping);
+ struct address_space *mapping = READ_ONCE(folio->mapping);
entry = NULL;
if (!mapping || !dax_mapping(mapping))
@@ -447,11 +447,11 @@ dax_entry_t dax_lock_page(struct page *page)
xas.xa = &mapping->i_pages;
xas_lock_irq(&xas);
- if (mapping != page->mapping) {
+ if (mapping != folio->mapping) {
xas_unlock_irq(&xas);
continue;
}
- xas_set(&xas, page->index);
+ xas_set(&xas, folio->index);
entry = xas_load(&xas);
if (dax_is_locked(entry)) {
rcu_read_unlock();
@@ -467,10 +467,10 @@ dax_entry_t dax_lock_page(struct page *page)
return (dax_entry_t)entry;
}
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
- struct address_space *mapping = page->mapping;
- XA_STATE(xas, &mapping->i_pages, page->index);
+ struct address_space *mapping = folio->mapping;
+ XA_STATE(xas, &mapping->i_pages, folio->index);
if (S_ISCHR(mapping->host->i_mode))
return;
diff --git a/fs/dcache.c b/fs/dcache.c
index 25ac74d30bff..c82ae731df9a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -78,7 +78,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
-static struct kmem_cache *dentry_cache __read_mostly;
+static struct kmem_cache *dentry_cache __ro_after_init;
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
@@ -96,9 +96,9 @@ EXPORT_SYMBOL(dotdot_name);
* information, yet avoid using a prime hash-size or similar.
*/
-static unsigned int d_hash_shift __read_mostly;
+static unsigned int d_hash_shift __ro_after_init;
-static struct hlist_bl_head *dentry_hashtable __read_mostly;
+static struct hlist_bl_head *dentry_hashtable __ro_after_init;
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
@@ -3246,11 +3246,10 @@ void d_genocide(struct dentry *parent)
d_walk(parent, parent, d_genocide_kill);
}
-void d_tmpfile(struct file *file, struct inode *inode)
+void d_mark_tmpfile(struct file *file, struct inode *inode)
{
struct dentry *dentry = file->f_path.dentry;
- inode_dec_link_count(inode);
BUG_ON(dentry->d_name.name != dentry->d_iname ||
!hlist_unhashed(&dentry->d_u.d_alias) ||
!d_unlinked(dentry));
@@ -3260,6 +3259,15 @@ void d_tmpfile(struct file *file, struct inode *inode)
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
+}
+EXPORT_SYMBOL(d_mark_tmpfile);
+
+void d_tmpfile(struct file *file, struct inode *inode)
+{
+ struct dentry *dentry = file->f_path.dentry;
+
+ inode_dec_link_count(inode);
+ d_mark_tmpfile(file, inode);
d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);
@@ -3324,7 +3332,7 @@ static void __init dcache_init(void)
}
/* SLAB cache for __getname() consumers */
-struct kmem_cache *names_cachep __read_mostly;
+struct kmem_cache *names_cachep __ro_after_init;
EXPORT_SYMBOL(names_cachep);
void __init vfs_caches_init_early(void)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 87b3753aa4b1..6d7c1a49581f 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -84,6 +84,14 @@ int debugfs_file_get(struct dentry *dentry)
struct debugfs_fsdata *fsd;
void *d_fsd;
+ /*
+ * This could only happen if some debugfs user erroneously calls
+	 * debugfs_file_get() on a dentry that isn't even a file; let
+	 * them know about it.
+ */
+ if (WARN_ON(!d_is_reg(dentry)))
+ return -EINVAL;
+
d_fsd = READ_ONCE(dentry->d_fsdata);
if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) {
fsd = d_fsd;
@@ -96,7 +104,11 @@ int debugfs_file_get(struct dentry *dentry)
~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
refcount_set(&fsd->active_users, 1);
init_completion(&fsd->active_users_drained);
+ INIT_LIST_HEAD(&fsd->cancellations);
+ mutex_init(&fsd->cancellations_mtx);
+
if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) {
+ mutex_destroy(&fsd->cancellations_mtx);
kfree(fsd);
fsd = READ_ONCE(dentry->d_fsdata);
}
@@ -138,6 +150,86 @@ void debugfs_file_put(struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(debugfs_file_put);
+/**
+ * debugfs_enter_cancellation - enter a debugfs cancellation
+ * @file: the file being accessed
+ * @cancellation: the cancellation object, the cancel callback
+ * inside of it must be initialized
+ *
+ * When a debugfs file is removed it needs to wait for all active
+ * operations to complete. However, the operation itself may need
+ * to wait for hardware or completion of some asynchronous process
+ * or similar. As such, it may need to be cancelled to avoid long
+ * waits or even deadlocks.
+ *
+ * This function can be used inside a debugfs handler that may
+ * need to be cancelled. As soon as this function is called, the
+ * cancellation's 'cancel' callback may be called, at which point
+ * the caller should proceed to call debugfs_leave_cancellation()
+ * and leave the debugfs handler function as soon as possible.
+ * Note that the 'cancel' callback is only ever called in the
+ * context of some kind of debugfs_remove().
+ *
+ * This function must be paired with debugfs_leave_cancellation().
+ */
+void debugfs_enter_cancellation(struct file *file,
+ struct debugfs_cancellation *cancellation)
+{
+ struct debugfs_fsdata *fsd;
+ struct dentry *dentry = F_DENTRY(file);
+
+ INIT_LIST_HEAD(&cancellation->list);
+
+ if (WARN_ON(!d_is_reg(dentry)))
+ return;
+
+ if (WARN_ON(!cancellation->cancel))
+ return;
+
+ fsd = READ_ONCE(dentry->d_fsdata);
+ if (WARN_ON(!fsd ||
+ ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ return;
+
+ mutex_lock(&fsd->cancellations_mtx);
+ list_add(&cancellation->list, &fsd->cancellations);
+ mutex_unlock(&fsd->cancellations_mtx);
+
+	/* if we're already removing, wake it up to cancel */
+ if (d_unlinked(dentry))
+ complete(&fsd->active_users_drained);
+}
+EXPORT_SYMBOL_GPL(debugfs_enter_cancellation);
+
+/**
+ * debugfs_leave_cancellation - leave cancellation section
+ * @file: the file being accessed
+ * @cancellation: the cancellation previously registered with
+ * debugfs_enter_cancellation()
+ *
+ * See the documentation of debugfs_enter_cancellation().
+ */
+void debugfs_leave_cancellation(struct file *file,
+ struct debugfs_cancellation *cancellation)
+{
+ struct debugfs_fsdata *fsd;
+ struct dentry *dentry = F_DENTRY(file);
+
+ if (WARN_ON(!d_is_reg(dentry)))
+ return;
+
+ fsd = READ_ONCE(dentry->d_fsdata);
+ if (WARN_ON(!fsd ||
+ ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ return;
+
+ mutex_lock(&fsd->cancellations_mtx);
+ if (!list_empty(&cancellation->list))
+ list_del(&cancellation->list);
+ mutex_unlock(&fsd->cancellations_mtx);
+}
+EXPORT_SYMBOL_GPL(debugfs_leave_cancellation);
+
/*
* Only permit access to world-readable files when the kernel is locked down.
* We also need to exclude any file that has ways to write or alter it as root
@@ -939,7 +1031,7 @@ static ssize_t debugfs_write_file_str(struct file *file, const char __user *user
new[pos + count] = '\0';
strim(new);
- rcu_assign_pointer(*(char **)file->private_data, new);
+ rcu_assign_pointer(*(char __rcu **)file->private_data, new);
synchronize_rcu();
kfree(old);
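
The new debugfs cancellation API above keeps per-file cancellation objects on a mutex-protected list so a pending remove can invoke their cancel callbacks. A single-threaded userspace model of that enter/cancel/leave flow, with no locking and purely illustrative types:

#include <stdio.h>

struct cancellation {
	void (*cancel)(void *data);
	void *data;
	struct cancellation *next;
};

static struct cancellation *pending;	/* stands in for fsd->cancellations */

static void enter_cancellation(struct cancellation *c)
{
	c->next = pending;
	pending = c;
}

static void leave_cancellation(struct cancellation *c)
{
	struct cancellation **p;

	for (p = &pending; *p; p = &(*p)->next) {
		if (*p == c) {
			*p = c->next;
			break;
		}
	}
}

static void cancel_all(void)	/* what the remover does while draining users */
{
	while (pending) {
		struct cancellation *c = pending;

		pending = c->next;
		c->cancel(c->data);
	}
}

static void my_cancel(void *data)
{
	printf("cancelled %s\n", (const char *)data);
}

int main(void)
{
	struct cancellation c = { .cancel = my_cancel, .data = "read" };

	enter_cancellation(&c);
	cancel_all();		/* would normally come from debugfs_remove() */
	leave_cancellation(&c);	/* safe: already removed from the list */
	return 0;
}
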
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 83e57e9f9fa0..034a617cb1a5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -72,7 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
@@ -236,17 +236,25 @@ static const struct super_operations debugfs_super_operations = {
static void debugfs_release_dentry(struct dentry *dentry)
{
- void *fsd = dentry->d_fsdata;
+ struct debugfs_fsdata *fsd = dentry->d_fsdata;
- if (!((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))
- kfree(dentry->d_fsdata);
+ if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
+ return;
+
+ /* check it wasn't a dir (no fsdata) or automount (no real_fops) */
+ if (fsd && fsd->real_fops) {
+ WARN_ON(!list_empty(&fsd->cancellations));
+ mutex_destroy(&fsd->cancellations_mtx);
+ }
+
+ kfree(fsd);
}
static struct vfsmount *debugfs_automount(struct path *path)
{
- debugfs_automount_t f;
- f = (debugfs_automount_t)path->dentry->d_fsdata;
- return f(path->dentry, d_inode(path->dentry)->i_private);
+ struct debugfs_fsdata *fsd = path->dentry->d_fsdata;
+
+ return fsd->automount(path->dentry, d_inode(path->dentry)->i_private);
}
static const struct dentry_operations debugfs_dops = {
@@ -634,13 +642,23 @@ struct dentry *debugfs_create_automount(const char *name,
void *data)
{
struct dentry *dentry = start_creating(name, parent);
+ struct debugfs_fsdata *fsd;
struct inode *inode;
if (IS_ERR(dentry))
return dentry;
+ fsd = kzalloc(sizeof(*fsd), GFP_KERNEL);
+ if (!fsd) {
+ failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ fsd->automount = f;
+
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
failed_creating(dentry);
+ kfree(fsd);
return ERR_PTR(-EPERM);
}
@@ -648,13 +666,14 @@ struct dentry *debugfs_create_automount(const char *name,
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create automount '%s'\n",
name);
+ kfree(fsd);
return failed_creating(dentry);
}
make_empty_dir_inode(inode);
inode->i_flags |= S_AUTOMOUNT;
inode->i_private = data;
- dentry->d_fsdata = (void *)f;
+ dentry->d_fsdata = fsd;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
@@ -731,8 +750,37 @@ static void __debugfs_file_removed(struct dentry *dentry)
fsd = READ_ONCE(dentry->d_fsdata);
if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
return;
- if (!refcount_dec_and_test(&fsd->active_users))
+
+ /* if we hit zero, just wait for all to finish */
+ if (!refcount_dec_and_test(&fsd->active_users)) {
wait_for_completion(&fsd->active_users_drained);
+ return;
+ }
+
+ /* if we didn't hit zero, try to cancel any we can */
+ while (refcount_read(&fsd->active_users)) {
+ struct debugfs_cancellation *c;
+
+ /*
+ * Lock the cancellations. Note that the cancellations
+ * structs are meant to be on the stack, so we need to
+ * ensure we either use them here or don't touch them,
+ * and debugfs_leave_cancellation() will wait for this
+ * to be finished processing before exiting one. It may
+ * of course win and remove the cancellation, but then
+	 * chances are we never even got into this bit; we only
+	 * do so if the refcount isn't already zero.
+ */
+ mutex_lock(&fsd->cancellations_mtx);
+ while ((c = list_first_entry_or_null(&fsd->cancellations,
+ typeof(*c), list))) {
+ list_del_init(&c->list);
+ c->cancel(dentry, c->cancel_data);
+ }
+ mutex_unlock(&fsd->cancellations_mtx);
+
+ wait_for_completion(&fsd->active_users_drained);
+ }
}
static void remove_one(struct dentry *victim)
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index 92af8ae31313..dae80c2a469e 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -7,6 +7,7 @@
#ifndef _DEBUGFS_INTERNAL_H_
#define _DEBUGFS_INTERNAL_H_
+#include <linux/list.h>
struct file_operations;
@@ -17,8 +18,18 @@ extern const struct file_operations debugfs_full_proxy_file_operations;
struct debugfs_fsdata {
const struct file_operations *real_fops;
- refcount_t active_users;
- struct completion active_users_drained;
+ union {
+ /* automount_fn is used when real_fops is NULL */
+ debugfs_automount_t automount;
+ struct {
+ refcount_t active_users;
+ struct completion active_users_drained;
+
+ /* protect cancellations */
+ struct mutex cancellations_mtx;
+ struct list_head cancellations;
+ };
+ };
};
/*
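
The internal.h change above overlays the automount callback and the per-file bookkeeping in one union, discriminated by whether real_fops is set. A cut-down standalone model of that discrimination; nothing here is the real debugfs structure:

#include <stdio.h>

struct fsdata {
	const void *real_fops;
	union {
		void (*automount)(void);	/* valid when real_fops == NULL */
		struct {
			int active_users;	/* valid for regular files */
		};
	};
};

static void automount_cb(void) { }

static void show(const struct fsdata *fsd)
{
	if (!fsd->real_fops)
		printf("automount entry, callback %s\n",
		       fsd->automount ? "set" : "unset");
	else
		printf("file entry, %d active users\n", fsd->active_users);
}

int main(void)
{
	struct fsdata automnt = { 0 }, file = { 0 };

	automnt.automount = automount_cb;
	file.real_fops = "real fops";
	file.active_users = 1;

	show(&automnt);
	show(&file);
	return 0;
}
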
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 299c295a27a0..c830261aa883 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb)
}
inode->i_ino = 2;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
@@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
if (!inode)
goto fail;
inode->i_ino = 1;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
inode->i_ino = index + 3;
inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index));
sprintf(s, "%d", index);
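
Several hunks above (debugfs, devpts, efivarfs) replace the open-coded atime/mtime/ctime initialization with simple_inode_init_ts(). Roughly, that helper stamps one current time and uses it for all three fields; a toy userspace equivalent, assuming a plain struct with three timespecs rather than the kernel inode:

#include <stdio.h>
#include <time.h>

struct toy_inode {
	struct timespec atime, mtime, ctime;
};

/* illustrative stand-in for simple_inode_init_ts() */
static void toy_inode_init_ts(struct toy_inode *inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	inode->atime = now;
	inode->mtime = now;
	inode->ctime = now;
}

int main(void)
{
	struct toy_inode inode;

	toy_inode_init_ts(&inode);
	printf("ctime: %lld\n", (long long)inode.ctime.tv_sec);
	return 0;
}
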
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7bc494ee56b9..20533266ade6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -151,7 +151,7 @@ struct dio {
};
} ____cacheline_aligned_in_smp;
-static struct kmem_cache *dio_cache __read_mostly;
+static struct kmem_cache *dio_cache __ro_after_init;
/*
* How many pages are in the queue?
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 5aabcb6f0f15..42f332f46359 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -973,7 +973,8 @@ void dlm_delete_debug_comms_file(void *ctx)
void dlm_create_debug_file(struct dlm_ls *ls)
{
- char name[DLM_LOCKSPACE_LEN + 8];
+ /* Reserve enough space for the longest file name */
+ char name[DLM_LOCKSPACE_LEN + sizeof("_queued_asts")];
/* format 1 */
@@ -985,8 +986,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
/* format 2 */
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_locks", ls->ls_name);
ls->ls_debug_locks_dentry = debugfs_create_file(name,
0644,
@@ -996,8 +996,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
/* format 3 */
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_all", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_all", ls->ls_name);
ls->ls_debug_all_dentry = debugfs_create_file(name,
S_IFREG | S_IRUGO,
@@ -1007,8 +1006,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
/* format 4 */
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_toss", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_toss", ls->ls_name);
ls->ls_debug_toss_dentry = debugfs_create_file(name,
S_IFREG | S_IRUGO,
@@ -1016,8 +1014,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
ls,
&format4_fops);
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_waiters", ls->ls_name);
ls->ls_debug_waiters_dentry = debugfs_create_file(name,
0644,
@@ -1027,8 +1024,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
/* format 5 */
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_queued_asts", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_queued_asts", ls->ls_name);
ls->ls_debug_queued_asts_dentry = debugfs_create_file(name,
0644,
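
The dlm debugfs fix above sizes the name buffer from the longest suffix literal and lets snprintf() bound the write, which removes the memset() calls and the magic '+ 8'. A standalone sketch of the sizing trick; DLM_LOCKSPACE_LEN here is an illustrative value, not necessarily the kernel's:

#include <stdio.h>

#define DLM_LOCKSPACE_LEN 64	/* illustrative value only */

int main(void)
{
	const char *ls_name = "myls";
	/*
	 * sizeof("_queued_asts") counts the trailing NUL, so the buffer is
	 * guaranteed to hold the longest base name plus suffix plus
	 * terminator.
	 */
	char name[DLM_LOCKSPACE_LEN + sizeof("_queued_asts")];

	snprintf(name, sizeof(name), "%s%s", ls_name, "_queued_asts");
	printf("%s (buffer %zu bytes)\n", name, sizeof(name));
	return 0;
}
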
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index f7bc22e74db2..67f8dd8a05ef 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,7 @@
#include "config.h"
#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000)
+#define DLM_MAX_PROCESS_BUFFERS 24
#define NEEDED_RMEM (4*1024*1024)
struct connection {
@@ -194,6 +195,7 @@ static const struct dlm_proto_ops *dlm_proto_ops;
#define DLM_IO_END 1
#define DLM_IO_EOF 2
#define DLM_IO_RESCHED 3
+#define DLM_IO_FLUSH 4
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
@@ -202,6 +204,7 @@ static void process_dlm_messages(struct work_struct *work);
static DECLARE_WORK(process_work, process_dlm_messages);
static DEFINE_SPINLOCK(processqueue_lock);
static bool process_dlm_messages_pending;
+static atomic_t processqueue_count;
static LIST_HEAD(processqueue);
bool dlm_lowcomms_is_running(void)
@@ -874,6 +877,7 @@ static void process_dlm_messages(struct work_struct *work)
}
list_del(&pentry->list);
+ atomic_dec(&processqueue_count);
spin_unlock(&processqueue_lock);
for (;;) {
@@ -891,6 +895,7 @@ static void process_dlm_messages(struct work_struct *work)
}
list_del(&pentry->list);
+ atomic_dec(&processqueue_count);
spin_unlock(&processqueue_lock);
}
}
@@ -962,6 +967,7 @@ again:
con->rx_leftover);
spin_lock(&processqueue_lock);
+ ret = atomic_inc_return(&processqueue_count);
list_add_tail(&pentry->list, &processqueue);
if (!process_dlm_messages_pending) {
process_dlm_messages_pending = true;
@@ -969,6 +975,9 @@ again:
}
spin_unlock(&processqueue_lock);
+ if (ret > DLM_MAX_PROCESS_BUFFERS)
+ return DLM_IO_FLUSH;
+
return DLM_IO_SUCCESS;
}
@@ -1503,6 +1512,9 @@ static void process_recv_sockets(struct work_struct *work)
wake_up(&con->shutdown_wait);
/* CF_RECV_PENDING cleared */
break;
+ case DLM_IO_FLUSH:
+ flush_workqueue(process_workqueue);
+ fallthrough;
case DLM_IO_RESCHED:
cond_resched();
queue_work(io_workqueue, &con->rwork);
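
The lowcomms change above counts queued receive buffers and returns DLM_IO_FLUSH once DLM_MAX_PROCESS_BUFFERS is exceeded, so the receive worker flushes the processing workqueue before queueing more. A minimal userspace model of that backpressure pattern using C11 atomics (names and the flush handling are stand-ins):

#include <stdatomic.h>
#include <stdio.h>

#define MAX_PROCESS_BUFFERS 24	/* mirrors DLM_MAX_PROCESS_BUFFERS above */

static atomic_int processqueue_count;

enum io_ret { IO_SUCCESS, IO_FLUSH };

static enum io_ret queue_buffer(void)
{
	int count = atomic_fetch_add(&processqueue_count, 1) + 1;

	/* ... list_add_tail() and queue_work() would happen here ... */
	return count > MAX_PROCESS_BUFFERS ? IO_FLUSH : IO_SUCCESS;
}

static void process_one(void)
{
	atomic_fetch_sub(&processqueue_count, 1);
	/* ... actual message processing ... */
}

int main(void)
{
	int flushes = 0;

	for (int i = 0; i < 30; i++)
		if (queue_buffer() == IO_FLUSH)
			flushes++;	/* caller would flush the workqueue */
	for (int i = 0; i < 30; i++)
		process_one();
	printf("flush requested %d times\n", flushes);
	return 0;
}
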
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f641b36a36db..2247ebb61be1 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -337,13 +337,21 @@ static struct midcomms_node *nodeid2node(int nodeid)
int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
{
- int ret, r = nodeid_hash(nodeid);
+ int ret, idx, r = nodeid_hash(nodeid);
struct midcomms_node *node;
ret = dlm_lowcomms_addr(nodeid, addr, len);
if (ret)
return ret;
+ idx = srcu_read_lock(&nodes_srcu);
+ node = __find_node(nodeid, r);
+ if (node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return 0;
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
node = kmalloc(sizeof(*node), GFP_NOFS);
if (!node)
return -ENOMEM;
@@ -1030,15 +1038,15 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
break;
case DLM_VERSION_3_2:
+ /* send ack back if necessary */
+ dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD);
+
msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation,
ppc);
if (!msg) {
dlm_free_mhandle(mh);
goto err;
}
-
- /* send ack back if necessary */
- dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD);
break;
default:
dlm_free_mhandle(mh);
@@ -1260,12 +1268,23 @@ void dlm_midcomms_remove_member(int nodeid)
idx = srcu_read_lock(&nodes_srcu);
node = nodeid2node(nodeid);
- if (WARN_ON_ONCE(!node)) {
+	/* in case dlm_midcomms_close() already removed the node */
+ if (!node) {
srcu_read_unlock(&nodes_srcu, idx);
return;
}
spin_lock(&node->state_lock);
+	/* handle the case where dlm_midcomms_addr() created the node
+	 * but it was never added as a member because
+	 * dlm_midcomms_close() removed it
+ */
+ if (!node->users) {
+ spin_unlock(&node->state_lock);
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
node->users--;
pr_debug("node %d users dec count %d\n", nodeid, node->users);
@@ -1386,10 +1405,16 @@ void dlm_midcomms_shutdown(void)
midcomms_shutdown(node);
}
}
- srcu_read_unlock(&nodes_srcu, idx);
- mutex_unlock(&close_lock);
dlm_lowcomms_shutdown();
+
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ midcomms_node_reset(node);
+ }
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
}
int dlm_midcomms_close(int nodeid)
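
The midcomms change above makes dlm_midcomms_addr() idempotent by looking the nodeid up before allocating a new node. A simple userspace sketch of the lookup-before-insert pattern on a linked list (no SRCU or hashing, purely illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int nodeid;
	struct node *next;
};

static struct node *nodes;

static struct node *find_node(int nodeid)
{
	for (struct node *n = nodes; n; n = n->next)
		if (n->nodeid == nodeid)
			return n;
	return NULL;
}

static int add_node(int nodeid)
{
	struct node *n;

	if (find_node(nodeid))
		return 0;	/* already known: nothing to allocate */

	n = malloc(sizeof(*n));
	if (!n)
		return -1;
	n->nodeid = nodeid;
	n->next = nodes;
	nodes = n;
	return 0;
}

int main(void)
{
	add_node(1);
	add_node(1);	/* second call is a no-op instead of a duplicate */
	printf("node 1 %s\n", find_node(1) ? "present" : "missing");
	return 0;
}
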
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index f2ed0c0266cb..c586c5db18b5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -702,6 +702,6 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
loff_t offset);
-extern const struct xattr_handler *ecryptfs_xattr_handlers[];
+extern const struct xattr_handler * const ecryptfs_xattr_handlers[];
#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 992d9c7e64ae..b0e8774c435a 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -998,6 +998,14 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap,
return rc;
}
+static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ if (flags & AT_GETATTR_NOSEC)
+ return vfs_getattr_nosec(path, stat, request_mask, flags);
+ return vfs_getattr(path, stat, request_mask, flags);
+}
+
static int ecryptfs_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
@@ -1006,8 +1014,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
struct kstat lower_stat;
int rc;
- rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat,
- request_mask, flags);
+ rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry),
+ &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
@@ -1210,7 +1218,7 @@ static const struct xattr_handler ecryptfs_xattr_handler = {
.set = ecryptfs_xattr_set,
};
-const struct xattr_handler *ecryptfs_xattr_handlers[] = {
+const struct xattr_handler * const ecryptfs_xattr_handlers[] = {
&ecryptfs_xattr_handler,
NULL
};
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 59b52718a3a2..7e9961639802 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
inode_unlock(inode);
}
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index db9231f0e77b..91290fe4a70b 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -21,11 +21,15 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
dev_t dev, bool is_removable)
{
struct inode *inode = new_inode(sb);
+ struct efivarfs_fs_info *fsi = sb->s_fs_info;
+ struct efivarfs_mount_opts *opts = &fsi->mount_opts;
if (inode) {
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 8ebf3a6a8aa2..c66647f5c0bd 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -9,6 +9,15 @@
#include <linux/list.h>
#include <linux/efi.h>
+struct efivarfs_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
+};
+
+struct efivarfs_fs_info {
+ struct efivarfs_mount_opts mount_opts;
+};
+
struct efi_variable {
efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
efi_guid_t VendorGuid;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 996271473609..77240953a92e 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -8,6 +8,7 @@
#include <linux/efi.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/ucs2_string.h>
@@ -24,12 +25,28 @@ static void efivarfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
+static int efivarfs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct efivarfs_fs_info *sbi = sb->s_fs_info;
+ struct efivarfs_mount_opts *opts = &sbi->mount_opts;
+
+ if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->uid));
+ if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->gid));
+ return 0;
+}
+
static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
const u32 attr = EFI_VARIABLE_NON_VOLATILE |
EFI_VARIABLE_BOOTSERVICE_ACCESS |
EFI_VARIABLE_RUNTIME_ACCESS;
u64 storage_space, remaining_space, max_variable_size;
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
efi_status_t status;
/* Some UEFI firmware does not implement QueryVariableInfo() */
@@ -53,6 +70,7 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = storage_space;
buf->f_bfree = remaining_space;
buf->f_type = dentry->d_sb->s_magic;
+ buf->f_fsid = u64_to_fsid(id);
/*
* In f_bavail we declare the free space that the kernel will allow writing
@@ -70,6 +88,7 @@ static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
.drop_inode = generic_delete_inode,
.evict_inode = efivarfs_evict_inode,
+ .show_options = efivarfs_show_options,
};
/*
@@ -231,6 +250,45 @@ static int efivarfs_destroy(struct efivar_entry *entry, void *data)
return 0;
}
+enum {
+ Opt_uid, Opt_gid,
+};
+
+static const struct fs_parameter_spec efivarfs_parameters[] = {
+ fsparam_u32("uid", Opt_uid),
+ fsparam_u32("gid", Opt_gid),
+ {},
+};
+
+static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct efivarfs_fs_info *sbi = fc->s_fs_info;
+ struct efivarfs_mount_opts *opts = &sbi->mount_opts;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, efivarfs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(opts->uid))
+ return -EINVAL;
+ break;
+ case Opt_gid:
+ opts->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(opts->gid))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode = NULL;
@@ -277,10 +335,21 @@ static int efivarfs_get_tree(struct fs_context *fc)
static const struct fs_context_operations efivarfs_context_ops = {
.get_tree = efivarfs_get_tree,
+ .parse_param = efivarfs_parse_param,
};
static int efivarfs_init_fs_context(struct fs_context *fc)
{
+ struct efivarfs_fs_info *sfi;
+
+ sfi = kzalloc(sizeof(*sfi), GFP_KERNEL);
+ if (!sfi)
+ return -ENOMEM;
+
+ sfi->mount_opts.uid = GLOBAL_ROOT_UID;
+ sfi->mount_opts.gid = GLOBAL_ROOT_GID;
+
+ fc->s_fs_info = sfi;
fc->ops = &efivarfs_context_ops;
return 0;
}
@@ -301,6 +370,7 @@ static struct file_system_type efivarfs_type = {
.name = "efivarfs",
.init_fs_context = efivarfs_init_fs_context,
.kill_sb = efivarfs_kill_sb,
+ .parameters = efivarfs_parameters,
};
static __init int efivarfs_init(void)
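
The efivarfs changes above add uid= and gid= mount options parsed through fs_parse(), defaulting to GLOBAL_ROOT_UID/GID. A rough userspace sketch of equivalent option parsing with the same defaults (the string handling below is a stand-in for the fs_context machinery). With the real patch, a mount invocation along the lines of 'mount -t efivarfs -o uid=1000,gid=1000 efivarfs /sys/firmware/efi/efivars' should hand ownership of the variable files to that user.

#include <stdio.h>
#include <string.h>

struct mount_opts {
	unsigned int uid, gid;
};

static int parse_options(char *options, struct mount_opts *opts)
{
	char *opt;

	opts->uid = 0;	/* defaults correspond to GLOBAL_ROOT_UID/GID */
	opts->gid = 0;
	for (opt = strtok(options, ","); opt; opt = strtok(NULL, ",")) {
		if (sscanf(opt, "uid=%u", &opts->uid) == 1)
			continue;
		if (sscanf(opt, "gid=%u", &opts->gid) == 1)
			continue;
		return -1;	/* unknown option */
	}
	return 0;
}

int main(void)
{
	char options[] = "uid=1000,gid=1000";
	struct mount_opts opts;

	if (!parse_options(options, &opts))
		printf("uid=%u gid=%u\n", opts.uid, opts.gid);
	return 0;
}
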
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 3789d22ba501..7844ab24b813 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -103,10 +103,9 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid));
i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid));
inode->i_size = be32_to_cpu(efs_inode->di_size);
- inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime);
- inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime);
+ inode_set_atime(inode, be32_to_cpu(efs_inode->di_atime), 0);
+ inode_set_mtime(inode, be32_to_cpu(efs_inode->di_mtime), 0);
inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
/* this is the number of blocks in the file */
if (inode->i_size == 0) {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index b287f47c165b..f17fdac76b2e 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -123,6 +123,7 @@ static const struct super_operations efs_superblock_operations = {
};
static const struct export_operations efs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = efs_fh_to_dentry,
.fh_to_parent = efs_fh_to_parent,
.get_parent = efs_get_parent,
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index f6dc961e6c2b..1d318f85232d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -21,7 +21,7 @@ config EROFS_FS
performance under extremely memory pressure without extra cost.
See the documentation at <file:Documentation/filesystems/erofs.rst>
- for more details.
+ and the web pages at <https://erofs.docs.kernel.org> for more details.
If unsure, say N.
@@ -91,13 +91,10 @@ config EROFS_FS_ZIP_LZMA
select XZ_DEC_MICROLZMA
help
Saying Y here includes support for reading EROFS file systems
- containing LZMA compressed data, specifically called microLZMA. it
- gives better compression ratios than the LZ4 algorithm, at the
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the default LZ4 format, at the
expense of more CPU overhead.
- LZMA support is an experimental feature for now and so most file
- systems will be readable without selecting this option.
-
If unsure, say N.
config EROFS_FS_ZIP_DEFLATE
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 349c3316ae6b..279933e007d2 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -21,6 +21,8 @@ struct z_erofs_decompress_req {
};
struct z_erofs_decompressor {
+ int (*config)(struct super_block *sb, struct erofs_super_block *dsb,
+ void *data, int size);
int (*decompress)(struct z_erofs_decompress_req *rq,
struct page **pagepool);
char *name;
@@ -92,6 +94,10 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
extern const struct z_erofs_decompressor erofs_decompressors[];
/* prototypes for specific algorithms */
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size);
+int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size);
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool);
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0c2c99c58b5e..c98aeda8abb2 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,9 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
-#include <linux/prefetch.h>
#include <linux/sched/mm.h>
-#include <linux/dax.h>
#include <trace/events/erofs.h>
void erofs_unmap_metabuf(struct erofs_buf *buf)
@@ -222,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev;
+ map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
@@ -240,7 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev;
+ map->m_bdev = dif->bdev_handle ?
+ dif->bdev_handle->bdev : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 332ec5f74002..021be5feb1bc 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -4,7 +4,6 @@
* https://www.huawei.com/
*/
#include "compress.h"
-#include <linux/module.h>
#include <linux/lz4.h>
#ifndef LZ4_DISTANCE_MAX /* history window size */
@@ -24,11 +23,11 @@ struct z_erofs_lz4_decompress_ctx {
unsigned int oend;
};
-int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int size)
+static int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct z_erofs_lz4_cfgs *lz4 = data;
u16 distance;
if (lz4) {
@@ -370,19 +369,75 @@ const struct z_erofs_decompressor erofs_decompressors[] = {
.name = "interlaced"
},
[Z_EROFS_COMPRESSION_LZ4] = {
+ .config = z_erofs_load_lz4_config,
.decompress = z_erofs_lz4_decompress,
.name = "lz4"
},
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
[Z_EROFS_COMPRESSION_LZMA] = {
+ .config = z_erofs_load_lzma_config,
.decompress = z_erofs_lzma_decompress,
.name = "lzma"
},
#endif
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
[Z_EROFS_COMPRESSION_DEFLATE] = {
+ .config = z_erofs_load_deflate_config,
.decompress = z_erofs_deflate_decompress,
.name = "deflate"
},
#endif
};
+
+int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int algs, alg;
+ erofs_off_t offset;
+ int size, ret = 0;
+
+ if (!erofs_sb_has_compr_cfgs(sbi)) {
+ sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4;
+ return z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ }
+
+ sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
+ if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
+ erofs_err(sb, "unidentified algorithms %x, please upgrade kernel",
+ sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
+ return -EOPNOTSUPP;
+ }
+
+ erofs_init_metabuf(&buf, sb);
+ offset = EROFS_SUPER_OFFSET + sbi->sb_size;
+ alg = 0;
+ for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ void *data;
+
+ if (!(algs & 1))
+ continue;
+
+ data = erofs_read_metadata(sb, &buf, &offset, &size);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+
+ if (alg >= ARRAY_SIZE(erofs_decompressors) ||
+ !erofs_decompressors[alg].config) {
+ erofs_err(sb, "algorithm %d isn't enabled on this kernel",
+ alg);
+ ret = -EOPNOTSUPP;
+ } else {
+ ret = erofs_decompressors[alg].config(sb,
+ dsb, data, size);
+ }
+
+ kfree(data);
+ if (ret)
+ break;
+ }
+ erofs_put_metabuf(&buf);
+ return ret;
+}
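
z_erofs_parse_cfgs() above walks the on-disk bitmap of available algorithms and dispatches each per-algorithm blob to the matching .config hook in erofs_decompressors[], failing cleanly when an algorithm is not built in. A compact userspace sketch of that table-driven dispatch (the table entries and payload size are made up):

#include <stdio.h>

struct decompressor {
	int (*config)(int size);
	const char *name;
};

static int lz4_config(int size)  { printf("lz4 cfg (%d bytes)\n", size);  return 0; }
static int lzma_config(int size) { printf("lzma cfg (%d bytes)\n", size); return 0; }

static const struct decompressor decompressors[] = {
	[0] = { .config = lz4_config,  .name = "lz4" },
	[1] = { .config = lzma_config, .name = "lzma" },
	/* an algorithm compiled out simply has .config == NULL */
	[2] = { .name = "deflate" },
};

static int parse_cfgs(unsigned int available_algs)
{
	unsigned int algs, alg = 0;

	for (algs = available_algs; algs; algs >>= 1, alg++) {
		if (!(algs & 1))
			continue;
		if (alg >= sizeof(decompressors) / sizeof(decompressors[0]) ||
		    !decompressors[alg].config)
			return -1;	/* not enabled in this build */
		if (decompressors[alg].config(8))
			return -1;
	}
	return 0;
}

int main(void)
{
	printf("ret=%d\n", parse_cfgs(0x3));	/* lz4 + lzma present */
	return 0;
}
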
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 19e5bdeb30b6..daf3c1bdeab8 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/module.h>
#include <linux/zlib.h>
#include "compress.h"
@@ -77,9 +76,10 @@ out_failed:
}
int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_deflate_cfgs *dfl, int size)
+ struct erofs_super_block *dsb, void *data, int size)
{
+ struct z_erofs_deflate_cfgs *dfl = data;
+
if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
erofs_err(sb, "invalid deflate cfgs, size=%u", size);
return -EINVAL;
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index dee10d22ada9..2dd14f99c1dc 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/xz.h>
-#include <linux/module.h>
#include "compress.h"
struct z_erofs_lzma {
@@ -72,10 +71,10 @@ int __init z_erofs_lzma_init(void)
}
int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size)
+ struct erofs_super_block *dsb, void *data, int size)
{
static DEFINE_MUTEX(lzma_resize_mutex);
+ struct z_erofs_lzma_cfgs *lzma = data;
unsigned int dict_size, i;
struct z_erofs_lzma *strm, *head = NULL;
int err;
@@ -96,8 +95,6 @@ int z_erofs_load_lzma_config(struct super_block *sb,
return -EINVAL;
}
- erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
-
/* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */
mutex_lock(&lzma_resize_mutex);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index edc8ec7581b8..14a79d3226ab 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -15,11 +15,11 @@ static void *erofs_read_inode(struct erofs_buf *buf,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_inode *vi = EROFS_I(inode);
const erofs_off_t inode_loc = erofs_iloc(inode);
-
erofs_blk_t blkaddr, nblks = 0;
void *kaddr;
struct erofs_inode_compact *dic;
struct erofs_inode_extended *die, *copied = NULL;
+ union erofs_inode_i_u iu;
unsigned int ifmt;
int err;
@@ -35,9 +35,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
dic = kaddr + *ofs;
ifmt = le16_to_cpu(dic->i_format);
-
if (ifmt & ~EROFS_I_ALL) {
- erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ erofs_err(sb, "unsupported i_format %u of nid %llu",
ifmt, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -45,7 +44,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->datalayout = erofs_inode_datalayout(ifmt);
if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
- erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
+ erofs_err(sb, "unsupported datalayout %u of nid %llu",
vi->datalayout, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -82,40 +81,15 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(die->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ iu = die->i_u;
i_uid_write(inode, le32_to_cpu(die->i_uid));
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
-
- /* extended inode has its own timestamp */
+ /* each extended inode has its own timestamp */
inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
-
- /* total blocks for compressed files */
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(die->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- /* fill chunked inode summary info */
- vi->chunkformat = le16_to_cpu(die->i_u.c.format);
kfree(copied);
copied = NULL;
break;
@@ -125,49 +99,51 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(dic->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ iu = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid));
set_nlink(inode, le16_to_cpu(dic->i_nlink));
-
/* use build time for compact inodes */
inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(dic->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
break;
default:
- erofs_err(inode->i_sb,
- "unsupported on-disk inode version %u of nid %llu",
+ erofs_err(sb, "unsupported on-disk inode version %u of nid %llu",
erofs_inode_version(ifmt), vi->nid);
err = -EOPNOTSUPP;
goto err_out;
}
- if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode,
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+
+ /* total blocks for compressed files */
+ if (erofs_inode_is_data_compressed(vi->datalayout)) {
+ nblks = le32_to_cpu(iu.compressed_blocks);
+ } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ /* fill chunked inode summary info */
+ vi->chunkformat = le16_to_cpu(iu.c.format);
if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
- erofs_err(inode->i_sb,
- "unsupported chunk format %x of nid %llu",
+ erofs_err(sb, "unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -175,7 +151,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode->i_mtime = inode->i_atime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_get_ctime(inode)));
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
@@ -190,10 +167,6 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
return kaddr;
-bogusimode:
- erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
- inode->i_mode, vi->nid);
- err = -EFSCORRUPTED;
err_out:
DBG_BUGON(1);
kfree(copied);
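
The erofs_read_inode() refactor above captures the on-disk i_u union once and moves the mode-type handling into a single switch after the version-specific branches. For reference, a tiny standalone example of classifying a mode with the same S_IFMT switch shape:

#include <stdio.h>
#include <sys/stat.h>

static const char *classify(unsigned int mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		return "uses a raw block address";
	case S_IFCHR:
	case S_IFBLK:
		return "uses a device number";
	case S_IFIFO:
	case S_IFSOCK:
		return "has no payload";
	default:
		return "bogus i_mode";
	}
}

int main(void)
{
	printf("%s\n", classify(S_IFREG | 0644));
	printf("%s\n", classify(0));	/* corrupted mode -> rejected */
	return 0;
}
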
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ff88d0dd980..b0409badb017 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -8,8 +8,10 @@
#define __EROFS_INTERNAL_H
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/dcache.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/magic.h>
@@ -47,7 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -228,8 +230,6 @@ struct erofs_buf {
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define ROOT_NID(sb) ((sb)->root_nid)
-
#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits)
#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
@@ -469,9 +469,6 @@ int __init z_erofs_init_zip_subsystem(void);
void z_erofs_exit_zip_subsystem(void);
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
-int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int len);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
void *erofs_get_pcpubuf(unsigned int requiredpages);
@@ -480,6 +477,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages);
void __init erofs_pcpubuf_init(void);
void erofs_pcpubuf_exit(void);
int erofs_init_managed_cache(struct super_block *sb);
+int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
@@ -487,16 +485,6 @@ static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
static inline int z_erofs_init_zip_subsystem(void) { return 0; }
static inline void z_erofs_exit_zip_subsystem(void) {}
-static inline int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int len)
-{
- if (lz4 || dsb->u1.lz4_max_distance) {
- erofs_err(sb, "lz4 algorithm isn't enabled");
- return -EINVAL;
- }
- return 0;
-}
static inline void erofs_pcpubuf_init(void) {}
static inline void erofs_pcpubuf_exit(void) {}
static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
@@ -505,41 +493,17 @@ static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
int __init z_erofs_lzma_init(void);
void z_erofs_lzma_exit(void);
-int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size);
#else
static inline int z_erofs_lzma_init(void) { return 0; }
static inline int z_erofs_lzma_exit(void) { return 0; }
-static inline int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size) {
- if (lzma) {
- erofs_err(sb, "lzma algorithm isn't enabled");
- return -EINVAL;
- }
- return 0;
-}
#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
int __init z_erofs_deflate_init(void);
void z_erofs_deflate_exit(void);
-int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_deflate_cfgs *dfl, int size);
#else
static inline int z_erofs_deflate_init(void) { return 0; }
static inline int z_erofs_deflate_exit(void) { return 0; }
-static inline int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_deflate_cfgs *dfl, int size) {
- if (dfl) {
- erofs_err(sb, "deflate algorithm isn't enabled");
- return -EINVAL;
- }
- return 0;
-}
#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
#ifdef CONFIG_EROFS_FS_ONDEMAND
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3700af9ee173..3789d6224513 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -4,14 +4,11 @@
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
-#include <linux/module.h>
#include <linux/statfs.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/crc32c.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
-#include <linux/dax.h>
#include <linux/exportfs.h>
#include "xattr.h"
@@ -156,68 +153,15 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
return buffer;
}
-#ifdef CONFIG_EROFS_FS_ZIP
-static int erofs_load_compr_cfgs(struct super_block *sb,
- struct erofs_super_block *dsb)
+#ifndef CONFIG_EROFS_FS_ZIP
+static int z_erofs_parse_cfgs(struct super_block *sb,
+ struct erofs_super_block *dsb)
{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- unsigned int algs, alg;
- erofs_off_t offset;
- int size, ret = 0;
-
- sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
- if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
- erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
- sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
- return -EINVAL;
- }
-
- erofs_init_metabuf(&buf, sb);
- offset = EROFS_SUPER_OFFSET + sbi->sb_size;
- alg = 0;
- for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
- void *data;
-
- if (!(algs & 1))
- continue;
-
- data = erofs_read_metadata(sb, &buf, &offset, &size);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
+ if (!dsb->u1.available_compr_algs)
+ return 0;
- switch (alg) {
- case Z_EROFS_COMPRESSION_LZ4:
- ret = z_erofs_load_lz4_config(sb, dsb, data, size);
- break;
- case Z_EROFS_COMPRESSION_LZMA:
- ret = z_erofs_load_lzma_config(sb, dsb, data, size);
- break;
- case Z_EROFS_COMPRESSION_DEFLATE:
- ret = z_erofs_load_deflate_config(sb, dsb, data, size);
- break;
- default:
- DBG_BUGON(1);
- ret = -EFAULT;
- }
- kfree(data);
- if (ret)
- break;
- }
- erofs_put_metabuf(&buf);
- return ret;
-}
-#else
-static int erofs_load_compr_cfgs(struct super_block *sb,
- struct erofs_super_block *dsb)
-{
- if (dsb->u1.available_compr_algs) {
- erofs_err(sb, "try to load compressed fs when compression is disabled");
- return -EINVAL;
- }
- return 0;
+ erofs_err(sb, "compression disabled, unable to mount compressed EROFS");
+ return -EOPNOTSUPP;
}
#endif
@@ -227,7 +171,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
void *ptr;
ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
@@ -251,13 +195,13 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(fscache);
dif->fscache = fscache;
} else if (!sbi->devs->flatdev) {
- bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type,
- NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
- NULL, NULL);
+ bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
+ sb->s_type, NULL);
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
+ dif->bdev_handle = bdev_handle;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
+ &dif->dax_part_off, NULL, NULL);
}
dif->blocks = le32_to_cpu(dis->blocks);
@@ -406,10 +350,7 @@ static int erofs_read_superblock(struct super_block *sb)
}
/* parse on-disk compression configurations */
- if (erofs_sb_has_compr_cfgs(sbi))
- ret = erofs_load_compr_cfgs(sb, dsb);
- else
- ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
goto out;
@@ -626,6 +567,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
}
static const struct export_operations erofs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = erofs_fh_to_dentry,
.fh_to_parent = erofs_fh_to_parent,
.get_parent = erofs_get_parent,
@@ -724,13 +666,13 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
xa_init(&sbi->managed_pslots);
#endif
- inode = erofs_iget(sb, ROOT_NID(sbi));
+ inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
return PTR_ERR(inode);
if (!S_ISDIR(inode->i_mode)) {
erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)",
- ROOT_NID(sbi), inode->i_mode);
+ sbi->root_nid, inode->i_mode);
iput(inode);
return -EINVAL;
}
@@ -760,7 +702,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
- erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi));
+ erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
@@ -806,8 +748,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev)
- blkdev_put(dif->bdev, &erofs_fs_type);
+ if (dif->bdev_handle)
+ bdev_release(dif->bdev_handle);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
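
The erofs/super.c hunks above move extra-device handling from blkdev_get_by_path()/blkdev_put() to the struct bdev_handle interface. A minimal sketch of that open/release pattern, assuming the transitional bdev_open_by_path()/bdev_release() API this series targets; the wrapper names are invented and only the two bdev_* calls come from the patch:

	#include <linux/fs.h>
	#include <linux/blkdev.h>

	/* illustrative wrappers, not part of the patch */
	static int example_open_extra_device(struct super_block *sb, const char *path,
					     struct bdev_handle **out)
	{
		struct bdev_handle *handle;

		/* read-only open; the filesystem type acts as the holder */
		handle = bdev_open_by_path(path, BLK_OPEN_READ, sb->s_type, NULL);
		if (IS_ERR(handle))
			return PTR_ERR(handle);

		*out = handle;		/* handle->bdev is the underlying block_device */
		return 0;
	}

	static void example_close_extra_device(struct bdev_handle *handle)
	{
		if (handle)
			bdev_release(handle);	/* replaces blkdev_put() */
	}
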
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index cc6fb9e98899..5dea308764b4 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -77,12 +77,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
struct erofs_sb_info *const sbi = EROFS_SB(sb);
struct erofs_workgroup *pre;
- /*
- * Bump up before making this visible to others for the XArray in order
- * to avoid potential UAF without serialized by xa_lock.
- */
- lockref_get(&grp->lockref);
-
+ DBG_BUGON(grp->lockref.count < 1);
repeat:
xa_lock(&sbi->managed_pslots);
pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
@@ -96,7 +91,6 @@ repeat:
cond_resched();
goto repeat;
}
- lockref_put_return(&grp->lockref);
grp = pre;
}
xa_unlock(&sbi->managed_pslots);
@@ -270,19 +264,24 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
return freed;
}
-static struct shrinker erofs_shrinker_info = {
- .scan_objects = erofs_shrink_scan,
- .count_objects = erofs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *erofs_shrinker_info;
int __init erofs_init_shrinker(void)
{
- return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
+ erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker");
+ if (!erofs_shrinker_info)
+ return -ENOMEM;
+
+ erofs_shrinker_info->count_objects = erofs_shrink_count;
+ erofs_shrinker_info->scan_objects = erofs_shrink_scan;
+
+ shrinker_register(erofs_shrinker_info);
+
+ return 0;
}
void erofs_exit_shrinker(void)
{
- unregister_shrinker(&erofs_shrinker_info);
+ shrinker_free(erofs_shrinker_info);
}
#endif /* !CONFIG_EROFS_FS_ZIP */
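
The erofs/utils.c change above drops the statically defined struct shrinker in favour of the dynamically allocated shrinker API. A minimal sketch of that registration pattern, assuming the shrinker_alloc()/shrinker_register()/shrinker_free() interface used by the hunk; the callbacks here are placeholders:

	#include <linux/init.h>
	#include <linux/shrinker.h>

	static struct shrinker *example_shrinker;

	static unsigned long example_count(struct shrinker *s, struct shrink_control *sc)
	{
		return 0;		/* placeholder: report number of freeable objects */
	}

	static unsigned long example_scan(struct shrinker *s, struct shrink_control *sc)
	{
		return SHRINK_STOP;	/* placeholder: free objects, return count freed */
	}

	static int __init example_shrinker_init(void)
	{
		example_shrinker = shrinker_alloc(0, "example-shrinker");
		if (!example_shrinker)
			return -ENOMEM;

		example_shrinker->count_objects = example_count;
		example_shrinker->scan_objects = example_scan;
		shrinker_register(example_shrinker);	/* now visible to memory reclaim */
		return 0;
	}

	static void example_shrinker_exit(void)
	{
		shrinker_free(example_shrinker);	/* unregisters and frees */
	}
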
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 09d341675e89..b58316b49a43 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -168,7 +168,7 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
};
#endif
-const struct xattr_handler *erofs_xattr_handlers[] = {
+const struct xattr_handler * const erofs_xattr_handlers[] = {
&erofs_xattr_user_handler,
&erofs_xattr_trusted_handler,
#ifdef CONFIG_EROFS_FS_SECURITY
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index f16283cb8c93..b246cd0e135e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -23,7 +23,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
{
const struct xattr_handler *handler = NULL;
- static const struct xattr_handler *xattr_handler_map[] = {
+ static const struct xattr_handler * const xattr_handler_map[] = {
[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
#ifdef CONFIG_EROFS_FS_POSIX_ACL
[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -44,7 +44,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
return xattr_prefix(handler);
}
-extern const struct xattr_handler *erofs_xattr_handlers[];
+extern const struct xattr_handler * const erofs_xattr_handlers[];
int erofs_xattr_prefixes_init(struct super_block *sb);
void erofs_xattr_prefixes_cleanup(struct super_block *sb);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 036f610e044b..a7e6847f6f8f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -796,6 +796,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
return PTR_ERR(pcl);
spin_lock_init(&pcl->obj.lockref.lock);
+ pcl->obj.lockref.count = 1; /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1d9a71a0c4c1..2877cc01cff1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -256,10 +256,10 @@ static u64 loop_check_gen = 0;
static struct eventpoll *inserting_into;
/* Slab cache used to allocate "struct epitem" */
-static struct kmem_cache *epi_cache __read_mostly;
+static struct kmem_cache *epi_cache __ro_after_init;
/* Slab cache used to allocate "struct eppoll_entry" */
-static struct kmem_cache *pwq_cache __read_mostly;
+static struct kmem_cache *pwq_cache __ro_after_init;
/*
* List of files with newly added links, where we may need to limit the number
@@ -271,7 +271,7 @@ struct epitems_head {
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
-static struct kmem_cache *ephead_cache __read_mostly;
+static struct kmem_cache *ephead_cache __ro_after_init;
static inline void free_ephead(struct epitems_head *head)
{
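
The eventpoll.c hunks only change an annotation: slab caches that are assigned once during boot and never written again are marked __ro_after_init instead of __read_mostly, so the pointers become read-only after init. A short hedged sketch of the pattern with an invented cache:

	#include <linux/init.h>
	#include <linux/cache.h>
	#include <linux/slab.h>

	struct example_item { int v; };

	/* written exactly once in the initcall below, then mapped read-only */
	static struct kmem_cache *example_cache __ro_after_init;

	static int __init example_cache_init(void)
	{
		example_cache = kmem_cache_create("example_item",
						  sizeof(struct example_item),
						  0, SLAB_PANIC, NULL);
		return 0;
	}
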
diff --git a/fs/exec.c b/fs/exec.c
index 6518e33ea813..4aa19b24f281 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -713,7 +713,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* process cleanup to remove whatever mess we made.
*/
if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false))
+ vma, new_start, length, false, true))
return -ENOMEM;
lru_add_drain();
@@ -986,8 +986,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
- if (old_mm)
- sync_mm_rss(old_mm);
ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index e1586bba6d86..9f9295847a4e 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -287,7 +287,7 @@ get_new:
mutex_unlock(&EXFAT_SB(sb)->s_lock);
if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum,
- (de.attr & ATTR_SUBDIR) ? DT_DIR : DT_REG))
+ (de.attr & EXFAT_ATTR_SUBDIR) ? DT_DIR : DT_REG))
goto out;
ctx->pos = cpos;
goto get_new;
@@ -359,7 +359,7 @@ unsigned int exfat_get_entry_type(struct exfat_dentry *ep)
if (ep->type == EXFAT_VOLUME)
return TYPE_VOLUME;
if (ep->type == EXFAT_FILE) {
- if (le16_to_cpu(ep->dentry.file.attr) & ATTR_SUBDIR)
+ if (le16_to_cpu(ep->dentry.file.attr) & EXFAT_ATTR_SUBDIR)
return TYPE_DIR;
return TYPE_FILE;
}
@@ -410,19 +410,21 @@ static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type)
ep->type = EXFAT_VOLUME;
} else if (type == TYPE_DIR) {
ep->type = EXFAT_FILE;
- ep->dentry.file.attr = cpu_to_le16(ATTR_SUBDIR);
+ ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_SUBDIR);
} else if (type == TYPE_FILE) {
ep->type = EXFAT_FILE;
- ep->dentry.file.attr = cpu_to_le16(ATTR_ARCHIVE);
+ ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_ARCHIVE);
}
}
static void exfat_init_stream_entry(struct exfat_dentry *ep,
- unsigned char flags, unsigned int start_clu,
- unsigned long long size)
+ unsigned int start_clu, unsigned long long size)
{
exfat_set_entry_type(ep, TYPE_STREAM);
- ep->dentry.stream.flags = flags;
+ if (size == 0)
+ ep->dentry.stream.flags = ALLOC_FAT_CHAIN;
+ else
+ ep->dentry.stream.flags = ALLOC_NO_FAT_CHAIN;
ep->dentry.stream.start_clu = cpu_to_le32(start_clu);
ep->dentry.stream.valid_size = cpu_to_le64(size);
ep->dentry.stream.size = cpu_to_le64(size);
@@ -488,9 +490,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
if (!ep)
return -EIO;
- exfat_init_stream_entry(ep,
- (type == TYPE_FILE) ? ALLOC_FAT_CHAIN : ALLOC_NO_FAT_CHAIN,
- start_clu, size);
+ exfat_init_stream_entry(ep, start_clu, size);
exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index f55498e5c23d..a7a2c35d74fb 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -234,6 +234,8 @@ struct exfat_mount_options {
discard:1, /* Issue discard requests on deletions */
keep_last_dots:1; /* Keep trailing periods in paths */
int time_offset; /* Offset of timestamps from UTC (in minutes) */
+ /* Support creating zero-size directory, default: false */
+ bool zero_size_dir;
};
/*
@@ -357,10 +359,10 @@ static inline int exfat_mode_can_hold_ro(struct inode *inode)
static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi,
unsigned short attr, mode_t mode)
{
- if ((attr & ATTR_READONLY) && !(attr & ATTR_SUBDIR))
+ if ((attr & EXFAT_ATTR_READONLY) && !(attr & EXFAT_ATTR_SUBDIR))
mode &= ~0222;
- if (attr & ATTR_SUBDIR)
+ if (attr & EXFAT_ATTR_SUBDIR)
return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
return (mode & ~sbi->options.fs_fmask) | S_IFREG;
@@ -372,18 +374,18 @@ static inline unsigned short exfat_make_attr(struct inode *inode)
unsigned short attr = EXFAT_I(inode)->attr;
if (S_ISDIR(inode->i_mode))
- attr |= ATTR_SUBDIR;
+ attr |= EXFAT_ATTR_SUBDIR;
if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & 0222))
- attr |= ATTR_READONLY;
+ attr |= EXFAT_ATTR_READONLY;
return attr;
}
static inline void exfat_save_attr(struct inode *inode, unsigned short attr)
{
if (exfat_mode_can_hold_ro(inode))
- EXFAT_I(inode)->attr = attr & (ATTR_RWMASK | ATTR_READONLY);
+ EXFAT_I(inode)->attr = attr & (EXFAT_ATTR_RWMASK | EXFAT_ATTR_READONLY);
else
- EXFAT_I(inode)->attr = attr & ATTR_RWMASK;
+ EXFAT_I(inode)->attr = attr & EXFAT_ATTR_RWMASK;
}
static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi,
@@ -549,6 +551,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 tz, __le16 time, __le16 date, u8 time_cs);
void exfat_truncate_atime(struct timespec64 *ts);
+void exfat_truncate_inode_atime(struct inode *inode);
void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 *tz, __le16 *time, __le16 *date, u8 *time_cs);
u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type);
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 0ece2e43cf49..971a1ccd0e89 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -64,15 +64,16 @@
#define CS_DEFAULT 2
/* file attributes */
-#define ATTR_READONLY 0x0001
-#define ATTR_HIDDEN 0x0002
-#define ATTR_SYSTEM 0x0004
-#define ATTR_VOLUME 0x0008
-#define ATTR_SUBDIR 0x0010
-#define ATTR_ARCHIVE 0x0020
-
-#define ATTR_RWMASK (ATTR_HIDDEN | ATTR_SYSTEM | ATTR_VOLUME | \
- ATTR_SUBDIR | ATTR_ARCHIVE)
+#define EXFAT_ATTR_READONLY 0x0001
+#define EXFAT_ATTR_HIDDEN 0x0002
+#define EXFAT_ATTR_SYSTEM 0x0004
+#define EXFAT_ATTR_VOLUME 0x0008
+#define EXFAT_ATTR_SUBDIR 0x0010
+#define EXFAT_ATTR_ARCHIVE 0x0020
+
+#define EXFAT_ATTR_RWMASK (EXFAT_ATTR_HIDDEN | EXFAT_ATTR_SYSTEM | \
+ EXFAT_ATTR_VOLUME | EXFAT_ATTR_SUBDIR | \
+ EXFAT_ATTR_ARCHIVE)
#define BOOTSEC_JUMP_BOOT_LEN 3
#define BOOTSEC_FS_NAME_LEN 8
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 32395ef686a2..bfdfafe00993 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -8,6 +8,9 @@
#include <linux/cred.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
+#include <linux/fsnotify.h>
+#include <linux/security.h>
+#include <linux/msdos_fs.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -22,7 +25,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
if (err)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
if (!IS_SYNC(inode))
@@ -144,7 +147,7 @@ int __exfat_truncate(struct inode *inode)
}
if (ei->type == TYPE_FILE)
- ei->attr |= ATTR_ARCHIVE;
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
/*
* update the directory entry
@@ -290,10 +293,10 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_valid & ATTR_SIZE)
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
setattr_copy(&nop_mnt_idmap, inode, attr);
- exfat_truncate_atime(&inode->i_atime);
+ exfat_truncate_inode_atime(inode);
if (attr->ia_valid & ATTR_SIZE) {
error = exfat_block_truncate_page(inode, attr->ia_size);
@@ -316,6 +319,93 @@ out:
return error;
}
+/*
+ * modified ioctls from fat/file.c by Werner Almesberger
+ */
+static int exfat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
+{
+ u32 attr;
+
+ inode_lock_shared(inode);
+ attr = exfat_make_attr(inode);
+ inode_unlock_shared(inode);
+
+ return put_user(attr, user_attr);
+}
+
+static int exfat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
+{
+ struct inode *inode = file_inode(file);
+ struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+ int is_dir = S_ISDIR(inode->i_mode);
+ u32 attr, oldattr;
+ struct iattr ia;
+ int err;
+
+ err = get_user(attr, user_attr);
+ if (err)
+ goto out;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+ inode_lock(inode);
+
+ oldattr = exfat_make_attr(inode);
+
+ /*
+ * Mask attributes so we don't set reserved fields.
+ */
+ attr &= (EXFAT_ATTR_READONLY | EXFAT_ATTR_HIDDEN | EXFAT_ATTR_SYSTEM |
+ EXFAT_ATTR_ARCHIVE);
+ attr |= (is_dir ? EXFAT_ATTR_SUBDIR : 0);
+
+ /* Equivalent to a chmod() */
+ ia.ia_valid = ATTR_MODE | ATTR_CTIME;
+ ia.ia_ctime = current_time(inode);
+ if (is_dir)
+ ia.ia_mode = exfat_make_mode(sbi, attr, 0777);
+ else
+ ia.ia_mode = exfat_make_mode(sbi, attr, 0666 | (inode->i_mode & 0111));
+
+ /* The root directory has no attributes */
+ if (inode->i_ino == EXFAT_ROOT_INO && attr != EXFAT_ATTR_SUBDIR) {
+ err = -EINVAL;
+ goto out_unlock_inode;
+ }
+
+ if (((attr | oldattr) & EXFAT_ATTR_SYSTEM) &&
+ !capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto out_unlock_inode;
+ }
+
+ /*
+ * The security check is questionable... We single
+ * out the RO attribute for checking by the security
+ * module, just because it maps to a file mode.
+ */
+ err = security_inode_setattr(file_mnt_idmap(file),
+ file->f_path.dentry, &ia);
+ if (err)
+ goto out_unlock_inode;
+
+ /* This MUST be done before doing anything irreversible... */
+ err = exfat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia);
+ if (err)
+ goto out_unlock_inode;
+
+ fsnotify_change(file->f_path.dentry, ia.ia_valid);
+
+ exfat_save_attr(inode, attr);
+ mark_inode_dirty(inode);
+out_unlock_inode:
+ inode_unlock(inode);
+ mnt_drop_write_file(file);
+out:
+ return err;
+}
+
static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg)
{
struct fstrim_range range;
@@ -346,8 +436,13 @@ static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg)
long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+ u32 __user *user_attr = (u32 __user *)arg;
switch (cmd) {
+ case FAT_IOCTL_GET_ATTRIBUTES:
+ return exfat_ioctl_get_attributes(inode, user_attr);
+ case FAT_IOCTL_SET_ATTRIBUTES:
+ return exfat_ioctl_set_attributes(filp, user_attr);
case FITRIM:
return exfat_ioctl_fitrim(inode, arg);
default:
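
The ioctls added to exfat above reuse the FAT attribute ioctl numbers from <linux/msdos_fs.h>, so existing FAT tooling keeps working. A small userspace sketch of reading and setting the archive bit through them; the file path is a placeholder:

	/* userspace example */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/msdos_fs.h>	/* FAT_IOCTL_{GET,SET}_ATTRIBUTES, ATTR_ARCH */

	int main(void)
	{
		__u32 attr;
		int fd = open("/mnt/exfat/file.txt", O_RDONLY);	/* placeholder path */

		if (fd < 0 || ioctl(fd, FAT_IOCTL_GET_ATTRIBUTES, &attr) < 0)
			return 1;
		printf("attributes: %#x\n", attr);

		attr |= ATTR_ARCH;	/* set the archive bit */
		if (ioctl(fd, FAT_IOCTL_SET_ATTRIBUTES, &attr) < 0)
			return 1;
		close(fd);
		return 0;
	}
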
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 13329baeafbc..e7ff58b8e68c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -26,6 +26,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
bool is_dir = (ei->type == TYPE_DIR) ? true : false;
+ struct timespec64 ts;
if (inode->i_ino == EXFAT_ROOT_INO)
return 0;
@@ -55,16 +56,18 @@ int __exfat_write_inode(struct inode *inode, int sync)
&ep->dentry.file.create_time,
&ep->dentry.file.create_date,
&ep->dentry.file.create_time_cs);
- exfat_set_entry_time(sbi, &inode->i_mtime,
- &ep->dentry.file.modify_tz,
- &ep->dentry.file.modify_time,
- &ep->dentry.file.modify_date,
- &ep->dentry.file.modify_time_cs);
- exfat_set_entry_time(sbi, &inode->i_atime,
- &ep->dentry.file.access_tz,
- &ep->dentry.file.access_time,
- &ep->dentry.file.access_date,
- NULL);
+ ts = inode_get_mtime(inode);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.modify_tz,
+ &ep->dentry.file.modify_time,
+ &ep->dentry.file.modify_date,
+ &ep->dentry.file.modify_time_cs);
+ ts = inode_get_atime(inode);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.access_tz,
+ &ep->dentry.file.access_time,
+ &ep->dentry.file.access_date,
+ NULL);
/* File size should be zero if there is no cluster allocated */
on_disk_size = i_size_read(inode);
@@ -355,7 +358,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
exfat_truncate(inode);
}
}
@@ -397,9 +400,9 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
if (err < len)
exfat_write_failed(mapping, pos+len);
- if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) {
- inode->i_mtime = inode_set_ctime_current(inode);
- ei->attr |= ATTR_ARCHIVE;
+ if (!(err < 0) && !(ei->attr & EXFAT_ATTR_ARCHIVE)) {
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
mark_inode_dirty(inode);
}
@@ -547,7 +550,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
inode_inc_iversion(inode);
inode->i_generation = get_random_u32();
- if (info->attr & ATTR_SUBDIR) { /* directory */
+ if (info->attr & EXFAT_ATTR_SUBDIR) { /* directory */
inode->i_generation &= ~1;
inode->i_mode = exfat_make_mode(sbi, info->attr, 0777);
inode->i_op = &exfat_dir_inode_operations;
@@ -576,10 +579,10 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
exfat_save_attr(inode, info->attr);
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
- inode->i_mtime = info->mtime;
+ inode_set_mtime_to_ts(inode, info->mtime);
inode_set_ctime_to_ts(inode, info->mtime);
ei->i_crtime = info->crtime;
- inode->i_atime = info->atime;
+ inode_set_atime_to_ts(inode, info->atime);
return 0;
}
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 2e1a1a6b1021..fa8459828046 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -126,6 +126,14 @@ void exfat_truncate_atime(struct timespec64 *ts)
ts->tv_nsec = 0;
}
+void exfat_truncate_inode_atime(struct inode *inode)
+{
+ struct timespec64 atime = inode_get_atime(inode);
+
+ exfat_truncate_atime(&atime);
+ inode_set_atime_to_ts(inode, atime);
+}
+
u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type)
{
int i;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 1b9f587f6cca..5d737e0b639a 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -351,14 +351,20 @@ static int exfat_find_empty_entry(struct inode *inode,
if (exfat_check_max_dentries(inode))
return -ENOSPC;
- /* we trust p_dir->size regardless of FAT type */
- if (exfat_find_last_cluster(sb, p_dir, &last_clu))
- return -EIO;
-
/*
* Allocate new cluster to this directory
*/
- exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags);
+ if (ei->start_clu != EXFAT_EOF_CLUSTER) {
+ /* we trust p_dir->size regardless of FAT type */
+ if (exfat_find_last_cluster(sb, p_dir, &last_clu))
+ return -EIO;
+
+ exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags);
+ } else {
+ /* This directory is empty */
+ exfat_chain_set(&clu, EXFAT_EOF_CLUSTER, 0,
+ ALLOC_NO_FAT_CHAIN);
+ }
/* allocate a cluster */
ret = exfat_alloc_cluster(inode, 1, &clu, IS_DIRSYNC(inode));
@@ -368,6 +374,11 @@ static int exfat_find_empty_entry(struct inode *inode,
if (exfat_zeroed_cluster(inode, clu.dir))
return -EIO;
+ if (ei->start_clu == EXFAT_EOF_CLUSTER) {
+ ei->start_clu = clu.dir;
+ p_dir->dir = clu.dir;
+ }
+
/* append to the FAT chain */
if (clu.flags != p_dir->flags) {
/* no-fat-chain bit is disabled,
@@ -507,7 +518,7 @@ static int exfat_add_entry(struct inode *inode, const char *path,
goto out;
}
- if (type == TYPE_DIR) {
+ if (type == TYPE_DIR && !sbi->options.zero_size_dir) {
ret = exfat_alloc_new_dir(inode, &clu);
if (ret)
goto out;
@@ -534,13 +545,16 @@ static int exfat_add_entry(struct inode *inode, const char *path,
info->type = type;
if (type == TYPE_FILE) {
- info->attr = ATTR_ARCHIVE;
+ info->attr = EXFAT_ATTR_ARCHIVE;
info->start_clu = EXFAT_EOF_CLUSTER;
info->size = 0;
info->num_subdirs = 0;
} else {
- info->attr = ATTR_SUBDIR;
- info->start_clu = start_clu;
+ info->attr = EXFAT_ATTR_SUBDIR;
+ if (sbi->options.zero_size_dir)
+ info->start_clu = EXFAT_EOF_CLUSTER;
+ else
+ info->start_clu = start_clu;
info->size = clu_size;
info->num_subdirs = EXFAT_MIN_SUBDIR;
}
@@ -569,7 +583,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -582,8 +596,9 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
+
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -645,7 +660,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
- if ((info->type == TYPE_FILE) && (info->size == 0)) {
+ if (info->size == 0) {
info->flags = ALLOC_NO_FAT_CHAIN;
info->start_clu = EXFAT_EOF_CLUSTER;
} else {
@@ -816,16 +831,16 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
- exfat_truncate_atime(&dir->i_atime);
+ simple_inode_init_ts(dir);
+ exfat_truncate_inode_atime(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
mark_inode_dirty(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
@@ -851,7 +866,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -865,8 +880,8 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -888,6 +903,9 @@ static int exfat_check_dir_empty(struct super_block *sb,
dentries_per_clu = sbi->dentries_per_clu;
+ if (p_dir->dir == EXFAT_EOF_CLUSTER)
+ return 0;
+
exfat_chain_dup(&clu, p_dir);
while (clu.dir != EXFAT_EOF_CLUSTER) {
@@ -977,8 +995,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
- exfat_truncate_atime(&dir->i_atime);
+ simple_inode_init_ts(dir);
+ exfat_truncate_inode_atime(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -986,8 +1004,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
@@ -1032,8 +1050,8 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
*epnew = *epold;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
- epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
- ei->attr |= ATTR_ARCHIVE;
+ epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
}
exfat_update_bh(new_bh, sync);
brelse(old_bh);
@@ -1064,8 +1082,8 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
ei->entry = newentry;
} else {
if (exfat_get_entry_type(epold) == TYPE_FILE) {
- epold->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
- ei->attr |= ATTR_ARCHIVE;
+ epold->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
}
exfat_update_bh(old_bh, sync);
brelse(old_bh);
@@ -1113,8 +1131,8 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
*epnew = *epmov;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
- epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
- ei->attr |= ATTR_ARCHIVE;
+ epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
}
exfat_update_bh(new_bh, IS_DIRSYNC(inode));
brelse(mov_bh);
@@ -1255,7 +1273,8 @@ static int __exfat_rename(struct inode *old_parent_inode,
}
/* Free the clusters if new_inode is a dir(as if exfat_rmdir) */
- if (new_entry_type == TYPE_DIR) {
+ if (new_entry_type == TYPE_DIR &&
+ new_ei->start_clu != EXFAT_EOF_CLUSTER) {
/* new_ei, new_clu_to_free */
struct exfat_chain new_clu_to_free;
@@ -1312,7 +1331,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
inode_inc_iversion(new_dir);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
- exfat_truncate_atime(&new_dir->i_atime);
+ exfat_truncate_inode_atime(new_dir);
if (IS_DIRSYNC(new_dir))
exfat_sync_inode(new_dir);
else
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 2778bd9b631e..d9d4fa91010b 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -165,6 +165,8 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",sys_tz");
else if (opts->time_offset)
seq_printf(m, ",time_offset=%d", opts->time_offset);
+ if (opts->zero_size_dir)
+ seq_puts(m, ",zero_size_dir");
return 0;
}
@@ -209,6 +211,7 @@ enum {
Opt_keep_last_dots,
Opt_sys_tz,
Opt_time_offset,
+ Opt_zero_size_dir,
/* Deprecated options */
Opt_utf8,
@@ -237,6 +240,7 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_flag("keep_last_dots", Opt_keep_last_dots),
fsparam_flag("sys_tz", Opt_sys_tz),
fsparam_s32("time_offset", Opt_time_offset),
+ fsparam_flag("zero_size_dir", Opt_zero_size_dir),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
__fsparam(NULL, "debug", Opt_debug, fs_param_deprecated,
@@ -305,6 +309,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
opts->time_offset = result.int_32;
break;
+ case Opt_zero_size_dir:
+ opts->zero_size_dir = true;
+ break;
case Opt_utf8:
case Opt_debug:
case Opt_namecase:
@@ -360,7 +367,7 @@ static int exfat_read_root(struct inode *inode)
inode->i_gid = sbi->options.fs_gid;
inode_inc_iversion(inode);
inode->i_generation = 0;
- inode->i_mode = exfat_make_mode(sbi, ATTR_SUBDIR, 0777);
+ inode->i_mode = exfat_make_mode(sbi, EXFAT_ATTR_SUBDIR, 0777);
inode->i_op = &exfat_dir_inode_operations;
inode->i_fop = &exfat_dir_operations;
@@ -369,9 +376,9 @@ static int exfat_read_root(struct inode *inode)
ei->i_size_aligned = i_size_read(inode);
ei->i_size_ondisk = i_size_read(inode);
- exfat_save_attr(inode, ATTR_SUBDIR);
- inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ exfat_save_attr(inode, EXFAT_ATTR_SUBDIR);
+ ei->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
return 0;
}
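
With the new flag wired into exfat_parameters above, zero-size directories can be enabled per mount. A hedged userspace sketch using mount(2); the device and mount point are placeholders:

	/* userspace example */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* "zero_size_dir" in the data string enables the new behaviour */
		if (mount("/dev/sdb1", "/mnt/exfat", "exfat", 0, "zero_size_dir") < 0) {
			perror("mount");
			return 1;
		}
		return 0;
	}
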
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c20704aa21b3..3ae0154c5680 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -342,43 +342,30 @@ out:
return error;
}
+#define FILEID_INO64_GEN_LEN 3
+
/**
- * export_encode_fh - default export_operations->encode_fh function
+ * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id
* @inode: the object to encode
* @fid: where to store the file handle fragment
- * @max_len: maximum length to store there
- * @parent: parent directory inode, if wanted
+ * @max_len: maximum length to store there (in 4 byte units)
*
- * This default encode_fh function assumes that the 32 inode number
- * is suitable for locating an inode, and that the generation number
- * can be used to check that it is still valid. It places them in the
- * filehandle fragment where export_decode_fh expects to find them.
+ * This generic function is used to encode a non-decodeable file id for
+ * fanotify for filesystems that do not support NFS export.
*/
-static int export_encode_fh(struct inode *inode, struct fid *fid,
- int *max_len, struct inode *parent)
+static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid,
+ int *max_len)
{
- int len = *max_len;
- int type = FILEID_INO32_GEN;
-
- if (parent && (len < 4)) {
- *max_len = 4;
- return FILEID_INVALID;
- } else if (len < 2) {
- *max_len = 2;
+ if (*max_len < FILEID_INO64_GEN_LEN) {
+ *max_len = FILEID_INO64_GEN_LEN;
return FILEID_INVALID;
}
- len = 2;
- fid->i32.ino = inode->i_ino;
- fid->i32.gen = inode->i_generation;
- if (parent) {
- fid->i32.parent_ino = parent->i_ino;
- fid->i32.parent_gen = parent->i_generation;
- len = 4;
- type = FILEID_INO32_GEN_PARENT;
- }
- *max_len = len;
- return type;
+ fid->i64.ino = inode->i_ino;
+ fid->i64.gen = inode->i_generation;
+ *max_len = FILEID_INO64_GEN_LEN;
+
+ return FILEID_INO64_GEN;
}
/**
@@ -396,17 +383,13 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
{
const struct export_operations *nop = inode->i_sb->s_export_op;
- /*
- * If a decodeable file handle was requested, we need to make sure that
- * filesystem can decode file handles.
- */
- if (nop && !(flags & EXPORT_FH_FID) && !nop->fh_to_dentry)
+ if (!exportfs_can_encode_fh(nop, flags))
return -EOPNOTSUPP;
- if (nop && nop->encode_fh)
- return nop->encode_fh(inode, fid->raw, max_len, parent);
+ if (!nop && (flags & EXPORT_FH_FID))
+ return exportfs_encode_ino64_fid(inode, fid, max_len);
- return export_encode_fh(inode, fid, max_len, parent);
+ return nop->encode_fh(inode, fid->raw, max_len, parent);
}
EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
@@ -456,7 +439,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
/*
* Try to get any dentry for the given file handle from the filesystem.
*/
- if (!nop || !nop->fh_to_dentry)
+ if (!exportfs_can_decode_fh(nop))
return ERR_PTR(-ESTALE);
result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
if (IS_ERR_OR_NULL(result))
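
The expfs.c rewrite above replaces the open-coded NULL checks with exportfs_can_encode_fh()/exportfs_can_decode_fh() helpers. A sketch of the semantics those helpers are expected to capture, reconstructed from the surrounding logic rather than copied from the header:

	#include <linux/exportfs.h>

	/* illustrative only: approximates the checks implied by the hunks above */
	static bool example_can_decode_fh(const struct export_operations *nop)
	{
		/* decoding needs a filesystem-provided fh_to_dentry() */
		return nop && nop->fh_to_dentry;
	}

	static bool example_can_encode_fh(const struct export_operations *nop, int flags)
	{
		/*
		 * Non-decodeable fids (EXPORT_FH_FID) may fall back to the generic
		 * 64-bit ino encoder when the fs has no export_operations at all;
		 * otherwise the fs must supply encode_fh itself.
		 */
		if (flags & EXPORT_FH_FID)
			return !nop || nop->encode_fh;

		/* decodeable handles additionally require decode support */
		return example_can_decode_fh(nop);
	}
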
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index b335f17f682f..4fb155b5a958 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -81,34 +81,34 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, page, NULL);
+ block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
- unlock_page(page);
+ folio_unlock(folio);
}
-static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
+static bool ext2_check_folio(struct folio *folio, int quiet, char *kaddr)
{
- struct inode *dir = page->mapping->host;
+ struct inode *dir = folio->mapping->host;
struct super_block *sb = dir->i_sb;
unsigned chunk_size = ext2_chunk_size(dir);
u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
+ unsigned limit = folio_size(folio);
ext2_dirent *p;
char *error;
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
+ if (dir->i_size < folio_pos(folio) + limit) {
+ limit = offset_in_folio(folio, dir->i_size);
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -132,7 +132,7 @@ static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
if (offs != limit)
goto Eend;
out:
- SetPageChecked(page);
+ folio_set_checked(folio);
return true;
/* Too bad, we had an error */
@@ -160,51 +160,52 @@ Einumber:
bad_entry:
if (!quiet)
ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
+ "offset=%llu, inode=%lu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, folio_pos(folio) + offs,
(unsigned long) le32_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
Eend:
if (!quiet) {
p = (ext2_dirent *)(kaddr + offs);
- ext2_error(sb, "ext2_check_page",
+ ext2_error(sb, "ext2_check_folio",
"entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
+ "offset=%llu, inode=%lu",
+ dir->i_ino, folio_pos(folio) + offs,
(unsigned long) le32_to_cpu(p->inode));
}
fail:
- SetPageError(page);
+ folio_set_error(folio);
return false;
}
/*
- * Calls to ext2_get_page()/ext2_put_page() must be nested according to the
- * rules documented in kmap_local_page()/kunmap_local().
+ * Calls to ext2_get_folio()/folio_release_kmap() must be nested according
+ * to the rules documented in kmap_local_folio()/kunmap_local().
*
- * NOTE: ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page()
- * and should be treated as a call to ext2_get_page() for nesting purposes.
+ * NOTE: ext2_find_entry() and ext2_dotdot() act as a call
+ * to folio_release_kmap() and should be treated as a call to
+ * folio_release_kmap() for nesting purposes.
*/
-static void *ext2_get_page(struct inode *dir, unsigned long n,
- int quiet, struct page **page)
+static void *ext2_get_folio(struct inode *dir, unsigned long n,
+ int quiet, struct folio **foliop)
{
struct address_space *mapping = dir->i_mapping;
struct folio *folio = read_mapping_folio(mapping, n, NULL);
- void *page_addr;
+ void *kaddr;
if (IS_ERR(folio))
return ERR_CAST(folio);
- page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1));
+ kaddr = kmap_local_folio(folio, 0);
if (unlikely(!folio_test_checked(folio))) {
- if (!ext2_check_page(&folio->page, quiet, page_addr))
+ if (!ext2_check_folio(folio, quiet, kaddr))
goto fail;
}
- *page = &folio->page;
- return page_addr;
+ *foliop = folio;
+ return kaddr;
fail:
- ext2_put_page(&folio->page, page_addr);
+ folio_release_kmap(folio, kaddr);
return ERR_PTR(-EIO);
}
@@ -274,8 +275,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
for ( ; n < npages; n++, offset = 0) {
ext2_dirent *de;
- struct page *page;
- char *kaddr = ext2_get_page(inode, n, 0, &page);
+ struct folio *folio;
+ char *kaddr = ext2_get_folio(inode, n, 0, &folio);
char *limit;
if (IS_ERR(kaddr)) {
@@ -299,7 +300,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
if (de->rec_len == 0) {
ext2_error(sb, __func__,
"zero-length directory entry");
- ext2_put_page(page, de);
+ folio_release_kmap(folio, de);
return -EIO;
}
if (de->inode) {
@@ -311,13 +312,13 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, de->name, de->name_len,
le32_to_cpu(de->inode),
d_type)) {
- ext2_put_page(page, de);
+ folio_release_kmap(folio, de);
return 0;
}
}
ctx->pos += ext2_rec_len_from_disk(de->rec_len);
}
- ext2_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
return 0;
}
@@ -330,38 +331,35 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
* and the entry itself. Page is returned mapped and unlocked.
* Entry is guaranteed to be valid.
*
- * On Success ext2_put_page() should be called on *res_page.
+ * On Success folio_release_kmap() should be called on *foliop.
*
- * NOTE: Calls to ext2_get_page()/ext2_put_page() must be nested according to
- * the rules documented in kmap_local_page()/kunmap_local().
+ * NOTE: Calls to ext2_get_folio()/folio_release_kmap() must be nested
+ * according to the rules documented in kmap_local_folio()/kunmap_local().
*
- * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and
- * should be treated as a call to ext2_get_page() for nesting purposes.
+ * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_folio()
+ * and should be treated as a call to ext2_get_folio() for nesting
+ * purposes.
*/
struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir,
- const struct qstr *child, struct page **res_page)
+ const struct qstr *child, struct folio **foliop)
{
const char *name = child->name;
int namelen = child->len;
unsigned reclen = EXT2_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
struct ext2_inode_info *ei = EXT2_I(dir);
ext2_dirent * de;
if (npages == 0)
goto out;
- /* OFFSET_CACHE */
- *res_page = NULL;
-
start = ei->i_dir_start_lookup;
if (start >= npages)
start = 0;
n = start;
do {
- char *kaddr = ext2_get_page(dir, n, 0, &page);
+ char *kaddr = ext2_get_folio(dir, n, 0, foliop);
if (IS_ERR(kaddr))
return ERR_CAST(kaddr);
@@ -371,18 +369,18 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir,
if (de->rec_len == 0) {
ext2_error(dir->i_sb, __func__,
"zero-length directory entry");
- ext2_put_page(page, de);
+ folio_release_kmap(*foliop, de);
goto out;
}
if (ext2_match(namelen, name, de))
goto found;
de = ext2_next_entry(de);
}
- ext2_put_page(page, kaddr);
+ folio_release_kmap(*foliop, kaddr);
if (++n >= npages)
n = 0;
- /* next page is past the blocks we've got */
+ /* next folio is past the blocks we've got */
if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
ext2_error(dir->i_sb, __func__,
"dir %lu size %lld exceeds block count %llu",
@@ -395,7 +393,6 @@ out:
return ERR_PTR(-ENOENT);
found:
- *res_page = page;
ei->i_dir_start_lookup = n;
return de;
}
@@ -404,17 +401,18 @@ found:
* Return the '..' directory entry and the page in which the entry was found
* (as a parameter - p).
*
- * On Success ext2_put_page() should be called on *p.
+ * On Success folio_release_kmap() should be called on *foliop.
*
- * NOTE: Calls to ext2_get_page()/ext2_put_page() must be nested according to
- * the rules documented in kmap_local_page()/kunmap_local().
+ * NOTE: Calls to ext2_get_folio()/folio_release_kmap() must be nested
+ * according to the rules documented in kmap_local_folio()/kunmap_local().
*
- * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and
- * should be treated as a call to ext2_get_page() for nesting purposes.
+ * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_folio()
+ * and should be treated as a call to ext2_get_folio() for nesting
+ * purposes.
*/
-struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p)
+struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct folio **foliop)
{
- ext2_dirent *de = ext2_get_page(dir, 0, 0, p);
+ ext2_dirent *de = ext2_get_folio(dir, 0, 0, foliop);
if (!IS_ERR(de))
return ext2_next_entry(de);
@@ -424,23 +422,22 @@ struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p)
int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino)
{
struct ext2_dir_entry_2 *de;
- struct page *page;
-
- de = ext2_find_entry(dir, child, &page);
+ struct folio *folio;
+
+ de = ext2_find_entry(dir, child, &folio);
if (IS_ERR(de))
return PTR_ERR(de);
*ino = le32_to_cpu(de->inode);
- ext2_put_page(page, de);
+ folio_release_kmap(folio, de);
return 0;
}
-static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+static int ext2_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- return __block_write_begin(page, pos, len, ext2_get_block);
+ return __block_write_begin(&folio->page, pos, len, ext2_get_block);
}
-
static int ext2_handle_dirsync(struct inode *dir)
{
int err;
@@ -452,23 +449,23 @@ static int ext2_handle_dirsync(struct inode *dir)
}
int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
- struct page *page, struct inode *inode, bool update_times)
+ struct folio *folio, struct inode *inode, bool update_times)
{
- loff_t pos = page_offset(page) + offset_in_page(de);
+ loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
unsigned len = ext2_rec_len_from_disk(de->rec_len);
int err;
- lock_page(page);
- err = ext2_prepare_chunk(page, pos, len);
+ folio_lock(folio);
+ err = ext2_prepare_chunk(folio, pos, len);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type(de, inode);
- ext2_commit_chunk(page, pos, len);
+ ext2_commit_chunk(folio, pos, len);
if (update_times)
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
return ext2_handle_dirsync(dir);
@@ -485,7 +482,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
unsigned chunk_size = ext2_chunk_size(dir);
unsigned reclen = EXT2_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
- struct page *page = NULL;
+ struct folio *folio = NULL;
ext2_dirent * de;
unsigned long npages = dir_pages(dir);
unsigned long n;
@@ -494,19 +491,19 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
/*
* We take care of directory expansion in the same loop.
- * This code plays outside i_size, so it locks the page
+ * This code plays outside i_size, so it locks the folio
* to protect that region.
*/
for (n = 0; n <= npages; n++) {
- char *kaddr = ext2_get_page(dir, n, 0, &page);
+ char *kaddr = ext2_get_folio(dir, n, 0, &folio);
char *dir_end;
if (IS_ERR(kaddr))
return PTR_ERR(kaddr);
- lock_page(page);
+ folio_lock(folio);
dir_end = kaddr + ext2_last_byte(dir, n);
de = (ext2_dirent *)kaddr;
- kaddr += PAGE_SIZE - reclen;
+ kaddr += folio_size(folio) - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -533,15 +530,15 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
goto got_it;
de = (ext2_dirent *) ((char *) de + rec_len);
}
- unlock_page(page);
- ext2_put_page(page, kaddr);
+ folio_unlock(folio);
+ folio_release_kmap(folio, kaddr);
}
BUG();
return -EINVAL;
got_it:
- pos = page_offset(page) + offset_in_page(de);
- err = ext2_prepare_chunk(page, pos, rec_len);
+ pos = folio_pos(folio) + offset_in_folio(folio, de);
+ err = ext2_prepare_chunk(folio, pos, rec_len);
if (err)
goto out_unlock;
if (de->inode) {
@@ -554,17 +551,17 @@ got_it:
memcpy(de->name, name, namelen);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
- ext2_commit_chunk(page, pos, rec_len);
- dir->i_mtime = inode_set_ctime_current(dir);
+ ext2_commit_chunk(folio, pos, rec_len);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
err = ext2_handle_dirsync(dir);
/* OFFSET_CACHE */
out_put:
- ext2_put_page(page, de);
+ folio_release_kmap(folio, de);
return err;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
goto out_put;
}
@@ -572,18 +569,21 @@ out_unlock:
* ext2_delete_entry deletes a directory entry by merging it with the
* previous entry. Page is up-to-date.
*/
-int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
+int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- char *kaddr = (char *)((unsigned long)dir & PAGE_MASK);
- unsigned from = offset_in_page(dir) & ~(ext2_chunk_size(inode)-1);
- unsigned to = offset_in_page(dir) +
- ext2_rec_len_from_disk(dir->rec_len);
+ struct inode *inode = folio->mapping->host;
+ size_t from, to;
+ char *kaddr;
loff_t pos;
- ext2_dirent *pde = NULL;
- ext2_dirent *de = (ext2_dirent *)(kaddr + from);
+ ext2_dirent *de, *pde = NULL;
int err;
+ from = offset_in_folio(folio, dir);
+ to = from + ext2_rec_len_from_disk(dir->rec_len);
+ kaddr = (char *)dir - from;
+ from &= ~(ext2_chunk_size(inode)-1);
+ de = (ext2_dirent *)(kaddr + from);
+
while ((char*)de < (char*)dir) {
if (de->rec_len == 0) {
ext2_error(inode->i_sb, __func__,
@@ -594,19 +594,19 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
de = ext2_next_entry(de);
}
if (pde)
- from = offset_in_page(pde);
- pos = page_offset(page) + from;
- lock_page(page);
- err = ext2_prepare_chunk(page, pos, to - from);
+ from = offset_in_folio(folio, pde);
+ pos = folio_pos(folio) + from;
+ folio_lock(folio);
+ err = ext2_prepare_chunk(folio, pos, to - from);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
if (pde)
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
- ext2_commit_chunk(page, pos, to - from);
- inode->i_mtime = inode_set_ctime_current(inode);
+ ext2_commit_chunk(folio, pos, to - from);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
return ext2_handle_dirsync(inode);
@@ -617,21 +617,21 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
*/
int ext2_make_empty(struct inode *inode, struct inode *parent)
{
- struct page *page = grab_cache_page(inode->i_mapping, 0);
+ struct folio *folio = filemap_grab_folio(inode->i_mapping, 0);
unsigned chunk_size = ext2_chunk_size(inode);
struct ext2_dir_entry_2 * de;
int err;
void *kaddr;
- if (!page)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- err = ext2_prepare_chunk(page, 0, chunk_size);
+ err = ext2_prepare_chunk(folio, 0, chunk_size);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
goto fail;
}
- kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
memset(kaddr, 0, chunk_size);
de = (struct ext2_dir_entry_2 *)kaddr;
de->name_len = 1;
@@ -647,26 +647,26 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
memcpy (de->name, "..\0", 4);
ext2_set_de_type (de, inode);
kunmap_local(kaddr);
- ext2_commit_chunk(page, 0, chunk_size);
+ ext2_commit_chunk(folio, 0, chunk_size);
err = ext2_handle_dirsync(inode);
fail:
- put_page(page);
+ folio_put(folio);
return err;
}
/*
* routine to check that the specified directory is empty (for rmdir)
*/
-int ext2_empty_dir (struct inode * inode)
+int ext2_empty_dir(struct inode *inode)
{
- struct page *page;
+ struct folio *folio;
char *kaddr;
unsigned long i, npages = dir_pages(inode);
for (i = 0; i < npages; i++) {
ext2_dirent *de;
- kaddr = ext2_get_page(inode, i, 0, &page);
+ kaddr = ext2_get_folio(inode, i, 0, &folio);
if (IS_ERR(kaddr))
return 0;
@@ -695,12 +695,12 @@ int ext2_empty_dir (struct inode * inode)
}
de = ext2_next_entry(de);
}
- ext2_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
return 1;
not_empty:
- ext2_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
return 0;
}
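
ext2_put_page() disappears above in favour of the generic folio_release_kmap(). A sketch of the equivalent cleanup it performs, assuming the helper simply pairs the local kmap with the folio reference taken by read_mapping_folio():

	#include <linux/mm.h>
	#include <linux/highmem.h>

	/* illustrative equivalent of the cleanup done by folio_release_kmap() */
	static inline void example_release_kmap(struct folio *folio, void *kaddr)
	{
		kunmap_local(kaddr);	/* undo kmap_local_folio() */
		folio_put(folio);	/* drop the folio reference */
	}
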
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 7fdd685c384d..677a9ad45dcb 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -717,22 +717,17 @@ extern void ext2_init_block_alloc_info(struct inode *);
extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv);
/* dir.c */
-extern int ext2_add_link (struct dentry *, struct inode *);
-extern int ext2_inode_by_name(struct inode *dir,
+int ext2_add_link(struct dentry *, struct inode *);
+int ext2_inode_by_name(struct inode *dir,
const struct qstr *child, ino_t *ino);
-extern int ext2_make_empty(struct inode *, struct inode *);
-extern struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *,
- struct page **);
-extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page);
-extern int ext2_empty_dir (struct inode *);
-extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p);
+int ext2_make_empty(struct inode *, struct inode *);
+struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *,
+ struct folio **foliop);
+int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct folio *folio);
+int ext2_empty_dir(struct inode *);
+struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct folio **foliop);
int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
- struct page *page, struct inode *inode, bool update_times);
-static inline void ext2_put_page(struct page *page, void *page_addr)
-{
- kunmap_local(page_addr);
- put_page(page);
-}
+ struct folio *folio, struct inode *inode, bool update_times);
/* ialloc.c */
extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 1039e5bf90af..4ddc36f4dbd4 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -258,7 +258,6 @@ static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out_unlock;
}
- iocb->ki_pos += status;
ret += status;
endbyte = pos + status - 1;
ret2 = filemap_write_and_wait_range(inode->i_mapping, pos,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c24d0de95a83..fdf63e9c6e7c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -546,7 +546,7 @@ got:
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_flags =
ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 314b415ee518..464faf6c217e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1291,7 +1291,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
__ext2_truncate_blocks(inode, newsize);
filemap_invalidate_unlock(inode->i_mapping);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (inode_needs_sync(inode)) {
sync_mapping_buffers(inode->i_mapping);
sync_inode_metadata(inode, 1);
@@ -1412,10 +1412,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
i_gid_write(inode, i_gid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
- inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+ inode_set_atime(inode, (signed)le32_to_cpu(raw_inode->i_atime), 0);
inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0);
- inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode, (signed)le32_to_cpu(raw_inode->i_mtime), 0);
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
* This is needed because nfsd might try to access dead inodes
@@ -1544,9 +1543,9 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le32(inode->i_size);
- raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
- raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+ raw_inode->i_atime = cpu_to_le32(inode_get_atime_sec(inode));
+ raw_inode->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode));
+ raw_inode->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode));
raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
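
The inode.c hunks above stop touching i_atime/i_mtime members directly and go through the accessor helpers instead. A short sketch of the accessor-based pattern, with an invented on-disk structure standing in for the ext2 raw inode:

	#include <linux/fs.h>

	struct example_raw_inode {		/* hypothetical on-disk layout */
		__le32 atime, ctime, mtime;
	};

	static void example_load_times(struct inode *inode,
				       const struct example_raw_inode *raw)
	{
		/* setters take seconds + nanoseconds instead of open-coded timespecs */
		inode_set_atime(inode, (signed)le32_to_cpu(raw->atime), 0);
		inode_set_ctime(inode, (signed)le32_to_cpu(raw->ctime), 0);
		inode_set_mtime(inode, (signed)le32_to_cpu(raw->mtime), 0);
	}

	static void example_store_times(struct inode *inode,
					struct example_raw_inode *raw)
	{
		/* the *_sec getters return just the seconds field */
		raw->atime = cpu_to_le32(inode_get_atime_sec(inode));
		raw->ctime = cpu_to_le32(inode_get_ctime_sec(inode));
		raw->mtime = cpu_to_le32(inode_get_mtime_sec(inode));
	}
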
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 059517068adc..65f702b1da5b 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -273,21 +273,21 @@ static int ext2_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
struct ext2_dir_entry_2 *de;
- struct page *page;
+ struct folio *folio;
int err;
err = dquot_initialize(dir);
if (err)
goto out;
- de = ext2_find_entry(dir, &dentry->d_name, &page);
+ de = ext2_find_entry(dir, &dentry->d_name, &folio);
if (IS_ERR(de)) {
err = PTR_ERR(de);
goto out;
}
- err = ext2_delete_entry(de, page);
- ext2_put_page(page, de);
+ err = ext2_delete_entry(de, folio);
+ folio_release_kmap(folio, de);
if (err)
goto out;
@@ -321,9 +321,9 @@ static int ext2_rename (struct mnt_idmap * idmap,
{
struct inode * old_inode = d_inode(old_dentry);
struct inode * new_inode = d_inode(new_dentry);
- struct page * dir_page = NULL;
+ struct folio *dir_folio = NULL;
struct ext2_dir_entry_2 * dir_de = NULL;
- struct page * old_page;
+ struct folio * old_folio;
struct ext2_dir_entry_2 * old_de;
int err;
@@ -338,19 +338,19 @@ static int ext2_rename (struct mnt_idmap * idmap,
if (err)
return err;
- old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (IS_ERR(old_de))
return PTR_ERR(old_de);
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
- dir_de = ext2_dotdot(old_inode, &dir_page);
+ dir_de = ext2_dotdot(old_inode, &dir_folio);
if (!dir_de)
goto out_old;
}
if (new_inode) {
- struct page *new_page;
+ struct folio *new_folio;
struct ext2_dir_entry_2 *new_de;
err = -ENOTEMPTY;
@@ -358,13 +358,13 @@ static int ext2_rename (struct mnt_idmap * idmap,
goto out_dir;
new_de = ext2_find_entry(new_dir, &new_dentry->d_name,
- &new_page);
+ &new_folio);
if (IS_ERR(new_de)) {
err = PTR_ERR(new_de);
goto out_dir;
}
- err = ext2_set_link(new_dir, new_de, new_page, old_inode, true);
- ext2_put_page(new_page, new_de);
+ err = ext2_set_link(new_dir, new_de, new_folio, old_inode, true);
+ folio_release_kmap(new_folio, new_de);
if (err)
goto out_dir;
inode_set_ctime_current(new_inode);
@@ -386,19 +386,19 @@ static int ext2_rename (struct mnt_idmap * idmap,
inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
- err = ext2_delete_entry(old_de, old_page);
+ err = ext2_delete_entry(old_de, old_folio);
if (!err && dir_de) {
if (old_dir != new_dir)
- err = ext2_set_link(old_inode, dir_de, dir_page,
+ err = ext2_set_link(old_inode, dir_de, dir_folio,
new_dir, false);
inode_dec_link_count(old_dir);
}
out_dir:
if (dir_de)
- ext2_put_page(dir_page, dir_de);
+ folio_release_kmap(dir_folio, dir_de);
out_old:
- ext2_put_page(old_page, old_de);
+ folio_release_kmap(old_folio, old_de);
return err;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index aaf3e3e88cb2..01f9addc8b1f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -397,6 +397,7 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid,
}
static const struct export_operations ext2_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ext2_fh_to_dentry,
.fh_to_parent = ext2_fh_to_parent,
.get_parent = ext2_get_parent,
@@ -1572,7 +1573,7 @@ out:
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
inode_inc_iversion(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 20f741184673..e849241ebb8f 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -98,7 +98,7 @@ static struct buffer_head *ext2_xattr_cache_find(struct inode *,
static void ext2_xattr_rehash(struct ext2_xattr_header *,
struct ext2_xattr_entry *);
-static const struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler * const ext2_xattr_handler_map[] = {
[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -110,7 +110,7 @@ static const struct xattr_handler *ext2_xattr_handler_map[] = {
#endif
};
-const struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler * const ext2_xattr_handlers[] = {
&ext2_xattr_user_handler,
&ext2_xattr_trusted_handler,
#ifdef CONFIG_EXT2_FS_SECURITY
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 7925f596e8e2..6a4966949047 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -72,7 +72,7 @@ extern void ext2_xattr_delete_inode(struct inode *);
extern struct mb_cache *ext2_xattr_create_cache(void);
extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
-extern const struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler * const ext2_xattr_handlers[];
# else /* CONFIG_EXT2_FS_XATTR */
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 0c5a79c3b5d4..ef4c19e5f570 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
+ /* usually, the umask is applied by posix_acl_create(), but if
+ ext4 ACL support is disabled at compile time, we need to do
+ it here, because posix_acl_create() will never be called */
+ inode->i_mode &= ~current_umask();
+
return 0;
}
#endif /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 79b20d6ae39e..591fb3f710be 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -22,6 +22,7 @@
#include "mballoc.h"
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group);
@@ -111,10 +112,8 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
itbl_blk_start = ext4_inode_table(sb, gdp);
itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
if (itbl_blk_start <= end && itbl_blk_end >= start) {
- itbl_blk_start = itbl_blk_start >= start ?
- itbl_blk_start : start;
- itbl_blk_end = itbl_blk_end <= end ?
- itbl_blk_end : end;
+ itbl_blk_start = max(itbl_blk_start, start);
+ itbl_blk_end = min(itbl_blk_end, end);
itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);
@@ -274,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh_p;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
+ sb, block_group, bh);
+
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
" groups_count = %u", block_group, ngroups);
@@ -468,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
ext4_fsblk_t bitmap_blk;
int err;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
+ sb, block_group, ignore_locked);
+
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return ERR_PTR(-EFSCORRUPTED);
@@ -563,6 +568,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
{
struct ext4_group_desc *desc;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
+ sb, block_group, bh);
+
if (!buffer_new(bh))
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
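The KUNIT_STATIC_STUB_REDIRECT() hooks added above pair with kunit_activate_static_stub() in the new mballoc KUnit test further down. A minimal sketch of the mechanism, with hypothetical my_real_func/my_stub names (the hooked function needs <kunit/static_stub.h>, the test <kunit/test.h>):

    int my_real_func(struct super_block *sb, int arg)
    {
            /* returns the stub's result instead when a test has activated one */
            KUNIT_STATIC_STUB_REDIRECT(my_real_func, sb, arg);
            return 0;                       /* normal implementation otherwise */
    }

    static int my_stub(struct super_block *sb, int arg)
    {
            return 42;                      /* canned result for the test */
    }

    static void my_test(struct kunit *test)
    {
            kunit_activate_static_stub(test, my_real_func, my_stub);
            KUNIT_EXPECT_EQ(test, my_real_func(NULL, 0), 42);
    }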
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 453d4da5de52..7ae0b61258a7 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -232,19 +232,14 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
return ext4_has_feature_stable_inodes(sb);
}
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
- *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
const struct fscrypt_operations ext4_cryptops = {
- .key_prefix = "ext4:",
+ .needs_bounce_pages = 1,
+ .has_32bit_inodes = 1,
+ .supports_subblock_data_units = 1,
+ .legacy_key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
.get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
.has_stable_inodes = ext4_has_stable_inodes,
- .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9418359b1d9d..a5d784872303 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -891,10 +891,13 @@ do { \
(raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
} while (0)
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
- EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+#define EXT4_INODE_SET_ATIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))
-#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
+#define EXT4_INODE_SET_MTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
@@ -910,9 +913,16 @@ do { \
.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \
})
-#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_GET_ATIME(inode, raw_inode) \
+do { \
+ inode_set_atime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \
+} while (0)
+
+#define EXT4_INODE_GET_MTIME(inode, raw_inode) \
do { \
- (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \
+ inode_set_mtime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \
} while (0)
#define EXT4_INODE_GET_CTIME(inode, raw_inode) \
@@ -1494,6 +1504,7 @@ struct ext4_sb_info {
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
+ /* Array of bh's for the block group descriptors */
struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
@@ -1537,7 +1548,7 @@ struct ext4_sb_info {
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *s_journal_bdev;
+ struct bdev_handle *s_journal_bdev_handle;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
@@ -1564,7 +1575,7 @@ struct ext4_sb_info {
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
unsigned int s_mb_free_pending;
- struct list_head s_freed_data_list; /* List of blocks to be freed
+ struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
struct list_head s_discard_list;
struct work_struct s_discard_work;
@@ -1653,7 +1664,7 @@ struct ext4_sb_info {
__u32 s_csum_seed;
/* Reclaim extents from extent status tree */
- struct shrinker s_es_shrinker;
+ struct shrinker *s_es_shrinker;
struct list_head s_es_list; /* List of inodes with reclaimable extents */
long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
@@ -1676,7 +1687,8 @@ struct ext4_sb_info {
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
- * or EXTENTS flag.
+ * or EXTENTS flag or between writepages ops and changing DELALLOC or
+ * DIOREAD_NOLOCK mount options on remount.
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
@@ -2924,7 +2936,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
- int len, int state);
+ int len, bool state);
static inline bool ext4_mb_cr_expensive(enum criteria cr)
{
return cr >= CR_GOAL_LEN_SLOW;
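The split of EXT4_INODE_{GET,SET}_XTIME into per-field ATIME/MTIME macros matches the accessor conversion above; the CTIME variants already existed. A minimal sketch of their use (example_sync_times is hypothetical; the real call sites are in the fs/ext4/inode.c hunks below):

    static void example_sync_times(struct inode *inode, struct ext4_inode *raw)
    {
            /* in-core times -> raw on-disk inode */
            EXT4_INODE_SET_CTIME(inode, raw);
            EXT4_INODE_SET_MTIME(inode, raw);
            EXT4_INODE_SET_ATIME(inode, raw);

            /* raw on-disk inode -> in-core times (when reading an inode) */
            EXT4_INODE_GET_CTIME(inode, raw);
            EXT4_INODE_GET_ATIME(inode, raw);
            EXT4_INODE_GET_MTIME(inode, raw);
    }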
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 202c76996b62..d5efe076d3d3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1010,6 +1010,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
ix = curp->p_idx;
}
+ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+ return -EFSCORRUPTED;
+ }
+
len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
BUG_ON(len < 0);
if (len > 0) {
@@ -1019,11 +1024,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
}
- if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
- EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
- return -EFSCORRUPTED;
- }
-
ix->ei_block = cpu_to_le32(logical);
ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1);
@@ -4481,7 +4481,8 @@ retry:
if (epos > new_size)
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
- inode->i_mtime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_get_ctime(inode));
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4617,7 +4618,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
@@ -4642,7 +4643,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_mutex;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
@@ -5378,7 +5379,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -5488,7 +5489,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
@@ -6080,13 +6081,13 @@ int ext4_ext_clear_bb(struct inode *inode)
for (j = 0; j < path->p_depth; j++) {
ext4_mb_mark_bb(inode->i_sb,
- path[j].p_block, 1, 0);
+ path[j].p_block, 1, false);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
0, path[j].p_block, 1, 1);
}
ext4_free_ext_path(path);
}
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
map.m_lblk, map.m_pblk, map.m_len, 1);
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 6f7de14c0fa8..4a00e2f019d9 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -152,8 +152,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc);
int __init ext4_init_es(void)
{
@@ -448,6 +449,19 @@ static void ext4_es_list_del(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
}
+static inline struct pending_reservation *__alloc_pending(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static inline void __free_pending(struct pending_reservation *pr)
+{
+ kmem_cache_free(ext4_pending_cachep, pr);
+}
+
/*
* Returns true if we cannot fail to allocate memory for this extent_status
* entry and cannot reclaim it until its status changes.
@@ -836,11 +850,12 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
- int err1 = 0;
- int err2 = 0;
+ int err1 = 0, err2 = 0, err3 = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
+ struct pending_reservation *pr = NULL;
+ bool revise_pending = false;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -868,11 +883,17 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
+ revise_pending = sbi->s_cluster_ratio > 1 &&
+ test_opt(inode->i_sb, DELALLOC) &&
+ (status & (EXTENT_STATUS_WRITTEN |
+ EXTENT_STATUS_UNWRITTEN));
retry:
if (err1 && !es1)
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
+ if ((err1 || err2 || err3) && revise_pending && !pr)
+ pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
@@ -897,13 +918,18 @@ retry:
es2 = NULL;
}
- if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
- (status & EXTENT_STATUS_WRITTEN ||
- status & EXTENT_STATUS_UNWRITTEN))
- __revise_pending(inode, lblk, len);
+ if (revise_pending) {
+ err3 = __revise_pending(inode, lblk, len, &pr);
+ if (err3 != 0)
+ goto error;
+ if (pr) {
+ __free_pending(pr);
+ pr = NULL;
+ }
+ }
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2)
+ if (err1 || err2 || err3)
goto retry;
ext4_es_print_tree(inode);
@@ -1311,7 +1337,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
rc->ndelonly--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
if (!node)
break;
pr = rb_entry(node, struct pending_reservation,
@@ -1405,8 +1431,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
}
if (count_reserved)
- count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
- &orig_es, &rc);
+ count_rsvd(inode, orig_es.es_lblk + len1,
+ orig_es.es_len - len1 - len2, &orig_es, &rc);
goto out_get_reserved;
}
@@ -1606,7 +1632,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
unsigned long nr;
struct ext4_sb_info *sbi;
- sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+ sbi = shrink->private_data;
nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
@@ -1615,8 +1641,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
static unsigned long ext4_es_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
+ struct ext4_sb_info *sbi = shrink->private_data;
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
@@ -1700,13 +1725,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err3;
- sbi->s_es_shrinker.scan_objects = ext4_es_scan;
- sbi->s_es_shrinker.count_objects = ext4_es_count;
- sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
- sbi->s_sb->s_id);
- if (err)
+ sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
+ if (!sbi->s_es_shrinker) {
+ err = -ENOMEM;
goto err4;
+ }
+
+ sbi->s_es_shrinker->scan_objects = ext4_es_scan;
+ sbi->s_es_shrinker->count_objects = ext4_es_count;
+ sbi->s_es_shrinker->private_data = sbi;
+
+ shrinker_register(sbi->s_es_shrinker);
return 0;
err4:
@@ -1726,7 +1755,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
- unregister_shrinker(&sbi->s_es_shrinker);
+ shrinker_free(sbi->s_es_shrinker);
}
/*
@@ -1907,11 +1936,13 @@ static struct pending_reservation *__get_pending(struct inode *inode,
*
* @inode - file containing the cluster
* @lblk - logical block in the cluster to be added
+ * @prealloc - preallocated pending entry
*
* Returns 0 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully.
*/
-static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
@@ -1937,10 +1968,15 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
}
}
- pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
- if (pr == NULL) {
- ret = -ENOMEM;
- goto out;
+ if (likely(*prealloc == NULL)) {
+ pr = __alloc_pending(false);
+ if (!pr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else {
+ pr = *prealloc;
+ *prealloc = NULL;
}
pr->lclu = lclu;
@@ -1970,7 +2006,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
if (pr != NULL) {
tree = &EXT4_I(inode)->i_pending_tree;
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
}
}
@@ -2029,10 +2065,10 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated)
{
struct extent_status newes;
- int err1 = 0;
- int err2 = 0;
+ int err1 = 0, err2 = 0, err3 = 0;
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
+ struct pending_reservation *pr = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -2052,6 +2088,8 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
+ if ((err1 || err2 || err3) && allocated && !pr)
+ pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
@@ -2074,11 +2112,18 @@ retry:
es2 = NULL;
}
- if (allocated)
- __insert_pending(inode, lblk);
+ if (allocated) {
+ err3 = __insert_pending(inode, lblk, &pr);
+ if (err3 != 0)
+ goto error;
+ if (pr) {
+ __free_pending(pr);
+ pr = NULL;
+ }
+ }
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2)
+ if (err1 || err2 || err3)
goto retry;
ext4_es_print_tree(inode);
@@ -2184,21 +2229,24 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
* @inode - file containing the range
* @lblk - logical block defining the start of range
* @len - length of range in blocks
+ * @prealloc - preallocated pending entry
*
* Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten
* status. Must be called while holding i_es_lock.
*/
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last;
bool f_del = false, l_del = false;
+ int ret = 0;
if (len == 0)
- return;
+ return 0;
/*
* Two cases - block range within single cluster and block range
@@ -2219,7 +2267,9 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
f_del = __es_scan_range(inode, &ext4_es_is_delonly,
first, lblk - 1);
if (f_del) {
- __insert_pending(inode, first);
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1;
@@ -2227,9 +2277,11 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
l_del = __es_scan_range(inode,
&ext4_es_is_delonly,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, last);
}
} else {
@@ -2237,18 +2289,24 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (first != lblk)
f_del = __es_scan_range(inode, &ext4_es_is_delonly,
first, lblk - 1);
- if (f_del)
- __insert_pending(inode, first);
- else
+ if (f_del) {
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, first);
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end)
l_del = __es_scan_range(inode, &ext4_es_is_delonly,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, last);
}
+out:
+ return ret;
}
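The shrinker hunks above convert the extent-status shrinker from an embedded struct registered with register_shrinker() to a heap-allocated one. A minimal sketch of the new API shape, mirroring the registration code above (the example_* wrappers are hypothetical):

    static int example_register_es_shrinker(struct ext4_sb_info *sbi)
    {
            sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
            if (!sbi->s_es_shrinker)
                    return -ENOMEM;

            sbi->s_es_shrinker->count_objects = ext4_es_count;
            sbi->s_es_shrinker->scan_objects  = ext4_es_scan;
            sbi->s_es_shrinker->private_data  = sbi;  /* read back via shrink->private_data */

            shrinker_register(sbi->s_es_shrinker);
            return 0;
    }

    static void example_unregister_es_shrinker(struct ext4_sb_info *sbi)
    {
            shrinker_free(sbi->s_es_shrinker);        /* replaces unregister_shrinker() */
    }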
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b06de728b3b6..87c009e0c59a 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1806,7 +1806,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
* at the end of the FC replay using our array of
* modified inodes.
*/
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
goto next;
}
@@ -1875,7 +1875,7 @@ ext4_fc_replay_del_range(struct super_block *sb,
if (ret > 0) {
remaining -= ret;
cur += ret;
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
} else {
remaining -= map.m_len;
cur += map.m_len;
@@ -1934,12 +1934,12 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
if (!IS_ERR(path)) {
for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb,
- path[j].p_block, 1, 1);
+ path[j].p_block, 1, true);
ext4_free_ext_path(path);
}
cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
- map.m_len, 1);
+ map.m_len, true);
} else {
cur = cur + (map.m_len ? map.m_len : 1);
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6830ea3a6c59..6aa15dafc677 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -306,80 +306,38 @@ out:
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t written, size_t count)
+ ssize_t count)
{
handle_t *handle;
- bool truncate = false;
- u8 blkbits = inode->i_blkbits;
- ext4_lblk_t written_blk, end_blk;
- int ret;
-
- /*
- * Note that EXT4_I(inode)->i_disksize can get extended up to
- * inode->i_size while the I/O was running due to writeback of delalloc
- * blocks. But, the code in ext4_iomap_alloc() is careful to use
- * zeroed/unwritten extents if this is possible; thus we won't leave
- * uninitialized blocks in a file even if we didn't succeed in writing
- * as much as we intended.
- */
- WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
- if (offset + count <= EXT4_I(inode)->i_disksize) {
- /*
- * We need to ensure that the inode is removed from the orphan
- * list if it has been added prematurely, due to writeback of
- * delalloc blocks.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
- }
-
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- }
-
- return written;
- }
-
- if (written < 0)
- goto truncate;
+ lockdep_assert_held_write(&inode->i_rwsem);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- written = PTR_ERR(handle);
- goto truncate;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- if (ext4_update_inode_size(inode, offset + written)) {
- ret = ext4_mark_inode_dirty(handle, inode);
+ if (ext4_update_inode_size(inode, offset + count)) {
+ int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
- written = ret;
ext4_journal_stop(handle);
- goto truncate;
+ return ret;
}
}
- /*
- * We may need to truncate allocated but not written blocks beyond EOF.
- */
- written_blk = ALIGN(offset + written, 1 << blkbits);
- end_blk = ALIGN(offset + count, 1 << blkbits);
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
-
- /*
- * Remove the inode from the orphan list if it has been extended and
- * everything went OK.
- */
- if (!truncate && inode->i_nlink)
+ if (inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
- if (truncate) {
-truncate:
+ return count;
+}
+
+/*
+ * Clean up the inode after DIO or DAX extending write has completed and the
+ * inode size has been updated using ext4_handle_inode_extension().
+ */
+static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
+{
+ lockdep_assert_held_write(&inode->i_rwsem);
+ if (count < 0) {
ext4_truncate_failed_write(inode);
/*
* If the truncate operation failed early, then the inode may
@@ -388,9 +346,29 @@ truncate:
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
+ return;
}
+ /*
+ * If i_disksize got extended either due to writeback of delalloc
+ * blocks or an extending truncate while the DIO was running, we could fail
+ * to clean up the orphan list in ext4_handle_inode_extension(). Do it
+ * now.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- return written;
+ if (IS_ERR(handle)) {
+ /*
+ * The write has successfully completed. Not much to
+ * do with the error here so just cleanup the orphan
+ * list and hope for the best.
+ */
+ ext4_orphan_del(NULL, inode);
+ return;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
}
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
@@ -399,31 +377,23 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
+ if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
-
- if (size && flags & IOMAP_DIO_UNWRITTEN) {
- error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
- if (error < 0)
- return error;
- }
/*
- * If we are extending the file, we have to update i_size here before
- * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
- * buffered reads could zero out too much from page cache pages. Update
- * of on-disk size will happen later in ext4_dio_write_iter() where
- * we have enough information to also perform orphan list handling etc.
- * Note that we perform all extending writes synchronously under
- * i_rwsem held exclusively so i_size update is safe here in that case.
- * If the write was not extending, we cannot see pos > i_size here
- * because operations reducing i_size like truncate wait for all
- * outstanding DIO before updating i_size.
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended. Also we can race with truncate or write
+ * expanding the file so we have to be a bit careful here.
*/
- pos += size;
- if (pos > i_size_read(inode))
- i_size_write(inode, pos);
-
- return 0;
+ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
+ pos + size <= i_size_read(inode))
+ return size;
+ return ext4_handle_inode_extension(inode, pos, size);
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -569,18 +539,20 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_buffered_write_iter(iocb, from);
}
+ /*
+ * Prevent inline data from being created since we are going to allocate
+ * blocks for DIO. We know the inode does not currently have inline data
+ * because ext4_should_use_dio() checked for it, but we have to clear
+ * the state flag before the write checks because a lock cycle could
+ * introduce races with other writers.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
&unwritten, &dio_flags);
if (ret <= 0)
return ret;
- /*
- * Make sure inline data cannot be created anymore since we are going
- * to allocate blocks for DIO. We know the inode does not have any
- * inline data now because ext4_dio_supported() checked for that.
- */
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-
offset = iocb->ki_pos;
count = ret;
@@ -606,9 +578,16 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
dio_flags, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
-
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ /*
+ * We always perform extending DIO writes synchronously, so by
+ * now the I/O has completed and ext4_handle_inode_extension()
+ * was called. Clean up the inode in case of error or a race with
+ * writeback of delalloc blocks.
+ */
+ WARN_ON_ONCE(ret == -EIOCBQUEUED);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
if (ilock_shared)
@@ -689,8 +668,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ ret = ext4_handle_inode_extension(inode, offset, ret);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
inode_unlock(inode);
if (ret > 0)
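After this refactor the extending-write paths share one shape: the on-disk size/orphan update runs in ext4_handle_inode_extension() (from the DIO end_io handler, or directly for DAX), and the issuer then only runs ext4_inode_extension_cleanup(). A rough sketch of that caller shape (do_write() is a hypothetical stand-in for iomap_dio_rw()/dax_iomap_rw(); locking and error handling are trimmed):

    static ssize_t example_extending_write(struct kiocb *iocb, struct iov_iter *from,
                                           bool extend)
    {
            struct inode *inode = file_inode(iocb->ki_filp);
            ssize_t ret;

            ret = do_write(iocb, from);     /* hypothetical placeholder */
            if (extend) {
                    /* i_size/i_disksize were already handled when the I/O finished */
                    ext4_inode_extension_cleanup(inode, ret);
            }
            return ret;
    }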
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index cdf9bfe10137..11e6f33677a2 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -576,8 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->s_journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev_handle &&
+ fm->fmr_device ==
+ new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev))
return true;
return false;
}
@@ -647,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->s_journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev_handle) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->s_journal_bdev->bd_dev);
+ EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b65058d972f9..e9bbb1da2d0a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1250,8 +1250,8 @@ got:
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- ei->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ ei->i_crtime = inode_get_mtime(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_dir_start_lookup = 0;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 012d9259ff53..9a84a5f9fef4 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
return 1;
@@ -1991,7 +1991,7 @@ out:
ext4_orphan_del(handle, inode);
if (err == 0) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4ce35f1c8b0a..61277f7f8722 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -789,10 +789,22 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
+ int ret = 0;
+
ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
+ ret = _ext4_get_block(inode, iblock, bh_result,
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+
+ /*
+ * If the buffer is marked unwritten, mark it as new to make sure it is
+ * zeroed out correctly in case of partial writes. Otherwise, there is
+ * a chance of stale data getting exposed.
+ */
+ if (ret == 0 && buffer_unwritten(bh_result))
+ set_buffer_new(bh_result);
+
+ return ret;
}
/* Maximum number of blocks we map for direct IO at once. */
@@ -1020,10 +1032,8 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
BUG_ON(from > to);
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, blocksize, 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
bbits = ilog2(blocksize);
block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
@@ -1153,7 +1163,7 @@ retry_grab:
* starting the handle.
*/
if (!folio_buffers(folio))
- create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0);
+ create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);
folio_unlock(folio);
@@ -3643,10 +3653,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
- if (!bh) {
- create_empty_buffers(&folio->page, blocksize, 0);
- bh = folio_buffers(folio);
- }
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
pos = blocksize;
@@ -4020,7 +4028,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret2))
ret = ret2;
@@ -4180,7 +4188,7 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(err2 && !err))
err = err2;
@@ -4284,8 +4292,8 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
EXT4_INODE_SET_CTIME(inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
@@ -4893,8 +4901,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
EXT4_INODE_GET_CTIME(inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_GET_ATIME(inode, raw_inode);
+ EXT4_INODE_GET_MTIME(inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
@@ -5019,8 +5027,8 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
spin_lock(&ei->i_raw_lock);
EXT4_INODE_SET_CTIME(inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
trace_ext4_other_inode_update_time(inode, orig_ino);
@@ -5413,7 +5421,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
* update c/mtime in shrink case below
*/
if (!shrink)
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
if (shrink)
ext4_fc_track_range(handle, inode,
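Several hunks above also adopt the folio-based create_empty_buffers(), which now returns the head buffer so callers no longer re-fetch it with folio_buffers(). A minimal sketch of the resulting caller pattern (example_get_buffers is hypothetical):

    static struct buffer_head *example_get_buffers(struct folio *folio,
                                                   unsigned long blocksize)
    {
            struct buffer_head *head = folio_buffers(folio);

            if (!head)
                    head = create_empty_buffers(folio, blocksize, 0);
            return head;
    }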
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0bfe2ce589e2..4f931f80cb34 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -312,13 +312,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
unsigned long tmp;
+ struct timespec64 ts1, ts2;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_atime, inode2->i_atime);
- swap(inode1->i_mtime, inode2->i_mtime);
+
+ ts1 = inode_get_atime(inode1);
+ ts2 = inode_get_atime(inode2);
+ inode_set_atime_to_ts(inode1, ts2);
+ inode_set_atime_to_ts(inode2, ts1);
+
+ ts1 = inode_get_mtime(inode1);
+ ts2 = inode_get_mtime(inode2);
+ inode_set_mtime_to_ts(inode1, ts2);
+ inode_set_mtime_to_ts(inode2, ts1);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
new file mode 100644
index 000000000000..f94901fd3835
--- /dev/null
+++ b/fs/ext4/mballoc-test.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 multiblocks allocation.
+ */
+
+#include <kunit/test.h>
+#include <kunit/static_stub.h>
+
+#include "ext4.h"
+
+struct mbt_grp_ctx {
+ struct buffer_head bitmap_bh;
+ /* desc and gd_bh are just placeholders for now */
+ struct ext4_group_desc desc;
+ struct buffer_head gd_bh;
+};
+
+struct mbt_ctx {
+ struct mbt_grp_ctx *grp_ctx;
+};
+
+struct mbt_ext4_super_block {
+ struct super_block sb;
+ struct mbt_ctx mbt_ctx;
+};
+
+#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx))
+#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+
+static struct super_block *mbt_ext4_alloc_super_block(void)
+{
+ struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL);
+ struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+
+ if (fsb == NULL || sbi == NULL || es == NULL)
+ goto out;
+
+ sbi->s_es = es;
+ fsb->sb.s_fs_info = sbi;
+ return &fsb->sb;
+
+out:
+ kfree(fsb);
+ kfree(sbi);
+ kfree(es);
+ return NULL;
+}
+
+static void mbt_ext4_free_super_block(struct super_block *sb)
+{
+ struct mbt_ext4_super_block *fsb =
+ container_of(sb, struct mbt_ext4_super_block, sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ kfree(sbi->s_es);
+ kfree(sbi);
+ kfree(fsb);
+}
+
+struct mbt_ext4_block_layout {
+ unsigned char blocksize_bits;
+ unsigned int cluster_bits;
+ uint32_t blocks_per_group;
+ ext4_group_t group_count;
+ uint16_t desc_size;
+};
+
+static void mbt_init_sb_layout(struct super_block *sb,
+ struct mbt_ext4_block_layout *layout)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sb->s_blocksize = 1UL << layout->blocksize_bits;
+ sb->s_blocksize_bits = layout->blocksize_bits;
+
+ sbi->s_groups_count = layout->group_count;
+ sbi->s_blocks_per_group = layout->blocks_per_group;
+ sbi->s_cluster_bits = layout->cluster_bits;
+ sbi->s_cluster_ratio = 1U << layout->cluster_bits;
+ sbi->s_clusters_per_group = layout->blocks_per_group >>
+ layout->cluster_bits;
+ sbi->s_desc_size = layout->desc_size;
+
+ es->s_first_data_block = cpu_to_le32(0);
+ es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
+ layout->group_count);
+}
+
+static int mbt_grp_ctx_init(struct super_block *sb,
+ struct mbt_grp_ctx *grp_ctx)
+{
+ grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
+ if (grp_ctx->bitmap_bh.b_data == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx)
+{
+ kfree(grp_ctx->bitmap_bh.b_data);
+ grp_ctx->bitmap_bh.b_data = NULL;
+}
+
+static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
+ unsigned int start, unsigned int len)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
+}
+
+/* called after mbt_init_sb_layout */
+static int mbt_ctx_init(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx),
+ GFP_KERNEL);
+ if (ctx->grp_ctx == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ngroups; i++)
+ if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i]))
+ goto out;
+
+ /*
+ * The first data block (the first cluster in the first group) is used by
+ * metadata; mark it used so we don't allocate a data block there, which
+ * would fail the ext4_sb_block_valid() check.
+ */
+ mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+
+ return 0;
+out:
+ while (i-- > 0)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+ return -ENOMEM;
+}
+
+static void mbt_ctx_release(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ for (i = 0; i < ngroups; i++)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+}
+
+static struct buffer_head *
+ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ /* paired with brelse from caller of ext4_read_block_bitmap_nowait */
+ get_bh(&grp_ctx->bitmap_bh);
+ return &grp_ctx->bitmap_bh;
+}
+
+static int ext4_wait_block_bitmap_stub(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ return 0;
+}
+
+static struct ext4_group_desc *
+ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group,
+ struct buffer_head **bh)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ if (bh != NULL)
+ *bh = &grp_ctx->gd_bh;
+
+ return &grp_ctx->desc;
+}
+
+static int
+ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags,
+ ext4_grpblk_t *ret_changed)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+ struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh;
+
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+
+ return 0;
+}
+
+#define TEST_GOAL_GROUP 1
+static int mbt_kunit_init(struct kunit *test)
+{
+ struct mbt_ext4_block_layout *layout =
+ (struct mbt_ext4_block_layout *)(test->param_value);
+ struct super_block *sb;
+ int ret;
+
+ sb = mbt_ext4_alloc_super_block();
+ if (sb == NULL)
+ return -ENOMEM;
+
+ mbt_init_sb_layout(sb, layout);
+
+ ret = mbt_ctx_init(sb);
+ if (ret != 0) {
+ mbt_ext4_free_super_block(sb);
+ return ret;
+ }
+
+ test->priv = sb;
+ kunit_activate_static_stub(test,
+ ext4_read_block_bitmap_nowait,
+ ext4_read_block_bitmap_nowait_stub);
+ kunit_activate_static_stub(test,
+ ext4_wait_block_bitmap,
+ ext4_wait_block_bitmap_stub);
+ kunit_activate_static_stub(test,
+ ext4_get_group_desc,
+ ext4_get_group_desc_stub);
+ kunit_activate_static_stub(test,
+ ext4_mb_mark_context,
+ ext4_mb_mark_context_stub);
+ return 0;
+}
+
+static void mbt_kunit_exit(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+}
+
+static void test_new_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode inode = { .i_sb = sb, };
+ struct ext4_allocation_request ar;
+ ext4_group_t i, goal_group = TEST_GOAL_GROUP;
+ int err = 0;
+ ext4_fsblk_t found;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ ar.inode = &inode;
+
+ /* get block at goal */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal, found,
+ "failed to alloc block at goal, expected %llu found %llu",
+ ar.goal, found);
+
+ /* get block after goal in goal group */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block after goal in goal group, expected %llu found %llu",
+ ar.goal + EXT4_C2B(sbi, 1), found);
+
+ /* get block after goal group */
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, goal_group + 1), found,
+ "failed to alloc block after goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, goal_group + 1), found);
+
+ /* get block before goal group */
+ for (i = goal_group; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block before goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found);
+
+ /* no block available, fail to allocate block */
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_NE_MSG(test, err, 0,
+ "unexpectedly get block when no block is available");
+}
+
+static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
+ {
+ .blocksize_bits = 10,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 12,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 16,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+};
+
+static void mbt_show_layout(const struct mbt_ext4_block_layout *layout,
+ char *desc)
+{
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d "
+ "blocks_per_group=%d group_count=%d desc_size=%d\n",
+ layout->blocksize_bits, layout->cluster_bits,
+ layout->blocks_per_group, layout->group_count,
+ layout->desc_size);
+}
+KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
+
+static struct kunit_case mbt_test_cases[] = {
+ KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+ {}
+};
+
+static struct kunit_suite mbt_test_suite = {
+ .name = "ext4_mballoc_test",
+ .init = mbt_kunit_init,
+ .exit = mbt_kunit_exit,
+ .test_cases = mbt_test_cases,
+};
+
+kunit_test_suites(&mbt_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1e599305d85f..d72b5e3c92ec 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -18,6 +18,7 @@
#include <linux/backing-dev.h>
#include <linux/freezer.h>
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
/*
* MUSTDO:
@@ -417,8 +418,6 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
@@ -1361,17 +1360,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
* We place the buddy block and bitmap block
* close together
*/
+ grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
- grinfo = ext4_get_group_info(sb, group);
- if (!grinfo) {
- err = -EFSCORRUPTED;
- goto out;
- }
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
sizeof(*grinfo->bb_counters) *
@@ -1398,7 +1397,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
- ext4_mb_generate_from_freelist(sb, data, group);
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
ext4_unlock_group(sb, group);
/* set incore so that the buddy information can be
@@ -3631,7 +3630,8 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&sbi->s_md_lock);
sbi->s_mb_free_pending = 0;
- INIT_LIST_HEAD(&sbi->s_freed_data_list);
+ INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
+ INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
INIT_LIST_HEAD(&sbi->s_discard_list);
INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
atomic_set(&sbi->s_retry_alloc_pending, 0);
@@ -3883,19 +3883,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_free_data *entry, *tmp;
LIST_HEAD(freed_data_list);
- struct list_head *cut_pos = NULL;
+ struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1];
bool wake;
- spin_lock(&sbi->s_md_lock);
- list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
- if (entry->efd_tid != commit_tid)
- break;
- cut_pos = &entry->efd_list;
- }
- if (cut_pos)
- list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
- cut_pos);
- spin_unlock(&sbi->s_md_lock);
+ list_replace_init(s_freed_head, &freed_data_list);
list_for_each_entry(entry, &freed_data_list, efd_list)
ext4_free_data_in_buddy(sb, entry);
@@ -3953,6 +3944,111 @@ void ext4_exit_mballoc(void)
ext4_groupinfo_destroy_slabs();
}
+#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
+#define EXT4_MB_SYNC_UPDATE 0x0002
+static int
+ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ int err;
+ unsigned int i, already, changed = len;
+
+ KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context,
+ handle, sb, state, group, blkoff, len,
+ flags, ret_changed);
+
+ if (ret_changed)
+ *ret_changed = 0;
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh))
+ return PTR_ERR(bitmap_bh);
+
+ if (handle) {
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ goto out_err;
+
+ if (handle) {
+ BUFFER_TRACE(gdp_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, gdp_bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+ }
+
+ ext4_lock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+
+ if (flags & EXT4_MB_BITMAP_MARKED_CHECK) {
+ already = 0;
+ for (i = 0; i < len; i++)
+ if (mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ state)
+ already++;
+ changed = len - already;
+ }
+
+ if (state) {
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_group_clusters(sb, gdp) - changed);
+ } else {
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_group_clusters(sb, gdp) + changed);
+ }
+
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_unlock_group(sb, group);
+ if (ret_changed)
+ *ret_changed = changed;
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
+
+ if (state)
+ atomic64_sub(changed, &fg->free_clusters);
+ else
+ atomic64_add(changed, &fg->free_clusters);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (err)
+ goto out_err;
+ err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+ if (err)
+ goto out_err;
+
+ if (flags & EXT4_MB_SYNC_UPDATE) {
+ sync_dirty_buffer(bitmap_bh);
+ sync_dirty_buffer(gdp_bh);
+ }
+
+out_err:
+ brelse(bitmap_bh);
+ return err;
+}
/*
* Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
@@ -3962,13 +4058,13 @@ static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
handle_t *handle, unsigned int reserv_clstrs)
{
- struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block;
int err, len;
+ int flags = 0;
+ ext4_grpblk_t changed;
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(ac->ac_b_ex.fe_len <= 0);
@@ -3976,32 +4072,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
- bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
- if (IS_ERR(bitmap_bh)) {
- return PTR_ERR(bitmap_bh);
- }
-
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto out_err;
-
- err = -EIO;
- gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+ gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL);
if (!gdp)
- goto out_err;
-
+ return -EIO;
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
ext4_free_group_clusters(sb, gdp));
- BUFFER_TRACE(gdp_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
- if (err)
- goto out_err;
-
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
-
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
@@ -4010,41 +4087,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* Fix the bitmap and return EFSCORRUPTED
* We leak some of the blocks here.
*/
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ err = ext4_mb_mark_context(handle, sb, true,
+ ac->ac_b_ex.fe_group,
+ ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len,
+ 0, NULL);
if (!err)
err = -EFSCORRUPTED;
- goto out_err;
+ return err;
}
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
#ifdef AGGRESSIVE_CHECK
- {
- int i;
- for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
- BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
- bitmap_bh->b_data));
- }
- }
+ flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb,
- ac->ac_b_ex.fe_group, gdp));
- }
- len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
- ext4_free_group_clusters_set(sb, gdp, len);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
+ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group,
+ ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len,
+ flags, &changed);
+
+ if (err && changed == 0)
+ return err;
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+#ifdef AGGRESSIVE_CHECK
+ BUG_ON(changed != ac->ac_b_ex.fe_len);
+#endif
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
@@ -4054,21 +4119,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
percpu_counter_sub(&sbi->s_dirtyclusters_counter,
reserv_clstrs);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi,
- ac->ac_b_ex.fe_group);
- atomic64_sub(ac->ac_b_ex.fe_len,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
-
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
- if (err)
- goto out_err;
- err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
-
-out_err:
- brelse(bitmap_bh);
return err;
}
@@ -4077,17 +4127,13 @@ out_err:
* blocks in bitmaps and update counters.
*/
void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
- int len, int state)
+ int len, bool state)
{
- struct buffer_head *bitmap_bh = NULL;
- struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i, err = 0;
- int already;
- unsigned int clen, clen_changed, thisgrp_len;
+ int err = 0;
+ unsigned int clen, thisgrp_len;
while (len > 0) {
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
@@ -4108,80 +4154,21 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
ext4_error(sb, "Marking blocks in system zone - "
"Block = %llu, len = %u",
block, thisgrp_len);
- bitmap_bh = NULL;
- break;
- }
-
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- break;
- }
-
- err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
break;
-
- ext4_lock_group(sb, group);
- already = 0;
- for (i = 0; i < clen; i++)
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
- !state)
- already++;
-
- clen_changed = clen - already;
- if (state)
- mb_set_bits(bitmap_bh->b_data, blkoff, clen);
- else
- mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb, group, gdp));
}
- if (state)
- clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
- else
- clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
-
- ext4_free_group_clusters_set(sb, gdp, clen);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
-
- ext4_unlock_group(sb, group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, group);
- struct flex_groups *fg = sbi_array_rcu_deref(sbi,
- s_flex_groups, flex_group);
-
- if (state)
- atomic64_sub(clen_changed, &fg->free_clusters);
- else
- atomic64_add(clen_changed, &fg->free_clusters);
-
- }
-
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
- if (err)
- break;
- sync_dirty_buffer(bitmap_bh);
- err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(gdp_bh);
+ err = ext4_mb_mark_context(NULL, sb, state,
+ group, blkoff, clen,
+ EXT4_MB_BITMAP_MARKED_CHECK |
+ EXT4_MB_SYNC_UPDATE,
+ NULL);
if (err)
break;
block += thisgrp_len;
len -= thisgrp_len;
- brelse(bitmap_bh);
BUG_ON(len < 0);
}
-
- if (err)
- brelse(bitmap_bh);
}
/*
@@ -4491,6 +4478,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
start = max(start, rounddown(ac->ac_o_ex.fe_logical,
(ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+ /* avoid unnecessary preallocation that may trigger assertions */
+ if (start + size > EXT_MAX_BLOCKS)
+ size = EXT_MAX_BLOCKS - start;
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -4959,31 +4950,6 @@ try_group_pa:
}
/*
- * the function goes through all block freed in the group
- * but not yet committed and marks them used in in-core bitmap.
- * buddy must be generated from this bitmap
- * Need to be called with the ext4 group lock held
- */
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group)
-{
- struct rb_node *n;
- struct ext4_group_info *grp;
- struct ext4_free_data *entry;
-
- grp = ext4_get_group_info(sb, group);
- if (!grp)
- return;
- n = rb_first(&(grp->bb_free_root));
-
- while (n) {
- entry = rb_entry(n, struct ext4_free_data, efd_node);
- mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
- n = rb_next(n);
- }
-}
-
-/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
* Need to be called with ext4 group lock held
@@ -6130,7 +6096,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
}
block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
- ext4_mb_mark_bb(sb, block, 1, 1);
+ ext4_mb_mark_bb(sb, block, 1, true);
ar->len = 1;
return block;
@@ -6378,7 +6344,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
spin_lock(&sbi->s_md_lock);
- list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
+ list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
}
@@ -6386,43 +6352,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
unsigned long count)
{
- struct buffer_head *bitmap_bh;
struct super_block *sb = inode->i_sb;
- struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
ext4_group_t group;
ext4_grpblk_t blkoff;
- int already_freed = 0, err, i;
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- pr_warn("Failed to read block bitmap\n");
- return;
- }
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
- goto err_out;
-
- for (i = 0; i < count; i++) {
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
- already_freed++;
- }
- mb_clear_bits(bitmap_bh->b_data, blkoff, count);
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
- if (err)
- goto err_out;
- ext4_free_group_clusters_set(
- sb, gdp, ext4_free_group_clusters(sb, gdp) +
- count - already_freed);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
- ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(bitmap_bh);
- sync_dirty_buffer(gdp_bh);
-
-err_out:
- brelse(bitmap_bh);
+ ext4_mb_mark_context(NULL, sb, false, group, blkoff, count,
+ EXT4_MB_BITMAP_MARKED_CHECK |
+ EXT4_MB_SYNC_UPDATE,
+ NULL);
}
/**
@@ -6438,19 +6376,17 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count,
int flags)
{
- struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
- struct ext4_group_desc *gdp;
struct ext4_group_info *grp;
unsigned int overflow;
ext4_grpblk_t bit;
- struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
struct ext4_buddy e4b;
unsigned int count_clusters;
int err = 0;
- int ret;
+ int mark_flags = 0;
+ ext4_grpblk_t changed;
sbi = EXT4_SB(sb);
@@ -6459,7 +6395,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
- goto error_return;
+ goto error_out;
}
flags |= EXT4_FREE_BLOCKS_VALIDATED;
@@ -6483,55 +6419,35 @@ do_more:
flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
count_clusters = EXT4_NUM_B2C(sbi, count);
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto error_return;
- }
- gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!gdp) {
- err = -EIO;
- goto error_return;
- }
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+
+ /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
+ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
+ if (err)
+ goto error_out;
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
- goto error_return;
+ goto error_clean;
}
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
- if (err)
- goto error_return;
#ifdef AGGRESSIVE_CHECK
- {
- int i;
- for (i = 0; i < count_clusters; i++)
- BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
- }
+ mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
- trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ count_clusters, mark_flags, &changed);
- /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
- err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
- GFP_NOFS|__GFP_NOFAIL);
- if (err)
- goto error_return;
+
+ if (err && changed == 0)
+ goto error_clean;
+
+#ifdef AGGRESSIVE_CHECK
+ BUG_ON(changed != count_clusters);
+#endif
/*
* We need to make sure we don't reuse the freed block until after the
@@ -6555,13 +6471,8 @@ do_more:
new_entry->efd_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
- /* need to update group_info->bb_free and bitmap
- * with group lock held. generate_buddy look at
- * them with group lock_held
- */
if (test_opt(sb, DISCARD)) {
err = ext4_issue_discard(sb, block_group, bit,
count_clusters, NULL);
@@ -6574,23 +6485,11 @@ do_more:
EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
- ext4_free_group_clusters_set(sb, gdp, ret);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(count_clusters,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
-
/*
* on a bigalloc file system, defer the s_freeclusters_counter
* update to the caller (ext4_remove_space and friends) so they
@@ -6603,28 +6502,18 @@ do_more:
count_clusters);
}
- ext4_mb_unload_buddy(&e4b);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
if (overflow && !err) {
block += count;
count = overflow;
- put_bh(bitmap_bh);
+ ext4_mb_unload_buddy(&e4b);
/* The range changed so it's no longer validated */
flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
goto do_more;
}
-error_return:
- brelse(bitmap_bh);
+
+error_clean:
+ ext4_mb_unload_buddy(&e4b);
+error_out:
ext4_std_error(sb, err);
}
@@ -6742,23 +6631,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count)
{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
- unsigned int i;
- struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int err = 0, ret, free_clusters_count;
- ext4_grpblk_t clusters_freed;
+ int err = 0;
ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
unsigned long cluster_count = last_cluster - first_cluster + 1;
+ ext4_grpblk_t changed;
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
- if (count == 0)
+ if (cluster_count == 0)
return 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -6770,99 +6655,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_warning(sb, "too many blocks added to group %u",
block_group);
err = -EINVAL;
- goto error_return;
- }
-
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto error_return;
+ goto error_out;
}
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc) {
- err = -EIO;
- goto error_return;
- }
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_out;
if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
err = -EINVAL;
- goto error_return;
+ goto error_clean;
}
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
- if (err)
- goto error_return;
-
- for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- clusters_freed++;
- }
- }
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
+ &changed);
+ if (err && changed == 0)
+ goto error_clean;
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
- goto error_return;
+ if (changed != cluster_count)
+ ext4_error(sb, "bit already cleared in group %u", block_group);
- /*
- * need to update group_info->bb_free and bitmap
- * with group lock held. generate_buddy look at
- * them with group lock_held
- */
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
mb_free_blocks(NULL, &e4b, bit, cluster_count);
- free_clusters_count = clusters_freed +
- ext4_free_group_clusters(sb, desc);
- ext4_free_group_clusters_set(sb, desc, free_clusters_count);
- ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
- ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
- clusters_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(clusters_freed,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
+ changed);
+error_clean:
ext4_mb_unload_buddy(&e4b);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
-error_return:
- brelse(bitmap_bh);
+error_out:
ext4_std_error(sb, err);
return err;
}
@@ -7170,3 +6995,7 @@ out_unload:
return error;
}
+
+#ifdef CONFIG_EXT4_KUNIT_TESTS
+#include "mballoc-test.c"
+#endif
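The mballoc hunks above replace several open-coded sequences (read the block
bitmap, get journal write access, set or clear bits under the group lock,
update the group descriptor and its checksums, dirty and optionally sync the
buffers) with calls to a single helper. A condensed sketch of the call shape,
taken from the call sites in this excerpt (the helper's definition is outside
this excerpt, so the prototype and the exact flag semantics are inferred):

	/* state: true = mark clusters in use, false = mark them free */
	err = ext4_mb_mark_context(handle, sb, state, group, blkoff, len,
				   EXT4_MB_BITMAP_MARKED_CHECK |	/* count bits actually flipped */
				   EXT4_MB_SYNC_UPDATE,			/* sync the dirtied buffers */
				   &changed);				/* may be NULL */
	if (err && changed == 0)
		return err;

Callers that only need the bitmap and descriptor updated pass NULL for the
changed count (ext4_mb_mark_bb(), ext4_free_blocks_simple()); callers that must
tell a partial update from a clean failure, such as ext4_mb_mark_diskspace_used()
and ext4_mb_clear_bb(), check the changed count before bailing out.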
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 18a9e7c47975..3aa57376d9c2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -183,10 +183,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
blocksize = i_blocksize(inode);
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, blocksize, 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits);
for (bh = head, block_start = 0; bh != head || !block_start;
@@ -380,9 +378,10 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- if (!folio_buffers(folio[0]))
- create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
bh = folio_buffers(folio[0]);
+ if (!bh)
+ bh = create_empty_buffers(folio[0],
+ 1 << orig_inode->i_blkbits, 0);
for (i = 0; i < data_offset_in_page; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index bbda587f76b8..d252935f9c8a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2207,7 +2207,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
err2 = ext4_mark_inode_dirty(handle, dir);
@@ -2280,8 +2280,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
top = data2 + len;
while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
- (data2 + (blocksize - csum_size) -
- (char *) de))) {
+ (char *)de - data2)) {
brelse(bh2);
brelse(bh);
return -EFSCORRUPTED;
@@ -3202,7 +3201,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
@@ -3277,7 +3276,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
@@ -3648,7 +3647,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
inode_inc_iversion(ent->dir);
- ent->dir->i_mtime = inode_set_ctime_current(ent->dir);
+ inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir));
retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3963,7 +3962,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
ext4_dec_count(new.inode);
inode_set_ctime_current(new.inode);
}
- old.dir->i_mtime = inode_set_ctime_current(old.dir);
+ inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
ext4_update_dx_flag(old.dir);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 3e7d160f543f..21e8f0aebb3c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio)
{
struct folio_iter fi;
- bio_for_each_folio_all(fi, bio) {
- struct folio *folio = fi.folio;
-
- if (bio->bi_status)
- folio_clear_uptodate(folio);
- else
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == 0);
if (bio->bi_private)
mempool_free(bio->bi_private, bio_post_read_ctx_pool);
bio_put(bio);
@@ -336,8 +329,7 @@ int ext4_mpage_readpages(struct inode *inode,
if (ext4_need_verity(inode, folio->index) &&
!fsverity_verify_folio(folio))
goto set_error_page;
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, true);
continue;
}
} else if (fully_mapped) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 0361c20910de..4fe061edefdd 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -10,8 +10,6 @@
*/
-#define EXT4FS_DEBUG
-
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
@@ -57,7 +55,7 @@ int ext4_resize_begin(struct super_block *sb)
* If the reserved GDT blocks is non-zero, the resize_inode feature
* should always be set.
*/
- if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+ if (sbi->s_es->s_reserved_gdt_blocks &&
!ext4_has_feature_resize_inode(sb)) {
ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
return -EFSCORRUPTED;
@@ -69,9 +67,9 @@ int ext4_resize_begin(struct super_block *sb)
* bad time to do it anyways.
*/
if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) !=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+ le32_to_cpu(sbi->s_es->s_first_data_block)) {
ext4_warning(sb, "won't resize using backup superblock at %llu",
- (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+ (unsigned long long)sbi->s_sbh->b_blocknr);
return -EPERM;
}
@@ -79,7 +77,7 @@ int ext4_resize_begin(struct super_block *sb)
* We are not allowed to do online-resizing on a filesystem mounted
* with error, because it can destroy the filesystem easily.
*/
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ if (sbi->s_mount_state & EXT4_ERROR_FS) {
ext4_warning(sb, "There are errors in the filesystem, "
"so online resizing is not allowed");
return -EPERM;
@@ -91,7 +89,7 @@ int ext4_resize_begin(struct super_block *sb)
}
if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
- &EXT4_SB(sb)->s_ext4_flags))
+ &sbi->s_ext4_flags))
ret = -EBUSY;
return ret;
@@ -106,18 +104,6 @@ int ext4_resize_end(struct super_block *sb, bool update_backups)
return 0;
}
-static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
- ext4_group_t group) {
- return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
- EXT4_DESC_PER_BLOCK_BITS(sb);
-}
-
-static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
- ext4_group_t group) {
- group = ext4_meta_bg_first_group(sb, group);
- return ext4_group_first_block_no(sb, group);
-}
-
static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
ext4_group_t group) {
ext4_grpblk_t overhead;
@@ -154,8 +140,9 @@ static int verify_group_input(struct super_block *sb,
overhead = ext4_group_overhead_blocks(sb, group);
metaend = start + overhead;
- input->free_clusters_count = free_blocks_count =
- input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+ free_blocks_count = input->blocks_count - 2 - overhead -
+ sbi->s_itb_per_group;
+ input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
@@ -460,8 +447,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster,
last_cluster);
- for (count2 = count; count > 0;
- count -= count2, first_cluster += count2) {
+ for (; count > 0; count -= count2, first_cluster += count2) {
ext4_fsblk_t start;
struct buffer_head *bh;
ext4_group_t group;
@@ -560,13 +546,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
goto handle_itb;
- if (meta_bg == 1) {
- ext4_group_t first_group;
- first_group = ext4_meta_bg_first_group(sb, group);
- if (first_group != group + 1 &&
- first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
- goto handle_itb;
- }
+ if (meta_bg == 1)
+ goto handle_itb;
block = start + ext4_bg_has_super(sb, group);
/* Copy all of the GDT blocks into the backup in this group */
@@ -614,7 +595,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
}
handle_itb:
- /* Initialize group tables of the grop @group */
+ /* Initialize group tables of the group @group */
if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
goto handle_bb;
@@ -704,16 +685,14 @@ handle_ib:
block = start;
}
- if (count) {
- err = set_flexbg_block_bitmap(sb, handle,
- flex_gd,
- EXT4_B2C(sbi, start),
- EXT4_B2C(sbi,
- start + count
- - 1));
- if (err)
- goto out;
- }
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd,
+ EXT4_B2C(sbi, start),
+ EXT4_B2C(sbi,
+ start + count
+ - 1));
+ if (err)
+ goto out;
}
out:
@@ -952,7 +931,13 @@ errout:
}
/*
- * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ * If there is no available space in the existing block group descriptors for
+ * the new block group and there are no reserved block group descriptors, then
+ * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set
+ * to the first block group that is managed using meta_bg; s_first_meta_bg
+ * must be a multiple of EXT4_DESC_PER_BLOCK(sb).
+ * This function is called when the first group of a meta_bg is added, to set
+ * up the new group descriptor block of the newly added meta_bg.
*/
static int add_new_gdb_meta_bg(struct super_block *sb,
handle_t *handle, ext4_group_t group) {
@@ -962,8 +947,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int err;
- gdblock = ext4_meta_bg_first_block_no(sb, group) +
- ext4_bg_has_super(sb, group);
+ gdblock = ext4_group_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group);
gdb_bh = ext4_sb_bread(sb, gdblock, 0);
if (IS_ERR(gdb_bh))
return PTR_ERR(gdb_bh);
@@ -1087,9 +1072,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
for (i = 0; i < reserved_gdb; i++) {
int err2;
data = (__le32 *)primary[i]->b_data;
- /* printk("reserving backup %lu[%u] = %lu\n",
- primary[i]->b_blocknr, gdbackups,
- blk + primary[i]->b_blocknr); */
data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
if (!err)
@@ -1191,8 +1173,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
ext4_group_first_block_no(sb, group));
BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, sb, bh,
- EXT4_JTR_NONE)))
+ EXT4_JTR_NONE))) {
+ brelse(bh);
break;
+ }
lock_buffer(bh);
memcpy(bh->b_data, data, size);
if (rest)
@@ -1601,7 +1585,8 @@ exit_journal:
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
int meta_bg = ext4_has_feature_meta_bg(sb);
- sector_t old_gdb = 0;
+ sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
+ ext4_group_first_block_no(sb, 0);
update_backups(sb, ext4_group_first_block_no(sb, 0),
(char *)es, sizeof(struct ext4_super_block), 0);
@@ -1610,11 +1595,8 @@ exit_journal:
gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
gdb_num);
- if (old_gdb == gdb_bh->b_blocknr)
- continue;
- update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
- gdb_bh->b_size, meta_bg);
- old_gdb = gdb_bh->b_blocknr;
+ update_backups(sb, gdb_bh->b_blocknr - padding_blocks,
+ gdb_bh->b_data, gdb_bh->b_size, meta_bg);
}
}
exit:
@@ -1980,9 +1962,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
errout:
ret = ext4_journal_stop(handle);
- if (!err)
- err = ret;
- return ret;
+ return err ? err : ret;
invalid_resize_inode:
ext4_error(sb, "corrupted/inconsistent resize inode");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dbebd8b3127e..c5fcf377ab1f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -244,18 +244,25 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
blk_opf_t op_flags)
{
- return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ ~__GFP_FS) | __GFP_MOVABLE;
+
+ return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
}
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block)
{
- return __ext4_sb_bread_gfp(sb, block, 0, 0);
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ ~__GFP_FS);
+
+ return __ext4_sb_bread_gfp(sb, block, 0, gfp);
}
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
- struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
+ struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
+ sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN);
if (likely(bh)) {
if (trylock_buffer(bh))
@@ -768,7 +775,8 @@ static void update_super_work(struct work_struct *work)
*/
if (!sb_rdonly(sbi->s_sb) && journal) {
struct buffer_head *sbh = sbi->s_sbh;
- bool call_notify_err;
+ bool call_notify_err = false;
+
handle = jbd2_journal_start(journal, 1);
if (IS_ERR(handle))
goto write_directly;
@@ -1351,14 +1359,14 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev) {
+ if (sbi->s_journal_bdev_handle) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->s_journal_bdev);
- invalidate_bdev(sbi->s_journal_bdev);
+ sync_blockdev(sbi->s_journal_bdev_handle->bdev);
+ invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -1646,6 +1654,7 @@ static const struct super_operations ext4_sops = {
};
static const struct export_operations ext4_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
.get_parent = ext4_get_parent,
@@ -4233,7 +4242,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->s_journal_bdev)
+ if (sbi->s_journal && !sbi->s_journal_bdev_handle)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
@@ -5670,9 +5679,9 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
brelse(sbi->s_sbh);
- if (sbi->s_journal_bdev) {
- invalidate_bdev(sbi->s_journal_bdev);
- blkdev_put(sbi->s_journal_bdev, sb);
+ if (sbi->s_journal_bdev_handle) {
+ invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
+ bdev_release(sbi->s_journal_bdev_handle);
}
out_fail:
invalidate_bdev(sb->s_bdev);
@@ -5842,12 +5851,13 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
return journal;
}
-static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
+static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
dev_t j_dev, ext4_fsblk_t *j_start,
ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
@@ -5856,16 +5866,17 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
/* see get_tree_bdev why this is needed and safe */
up_write(&sb->s_umount);
- bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
- &fs_holder_ops);
+ bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ sb, &fs_holder_ops);
down_write(&sb->s_umount);
- if (IS_ERR(bdev)) {
+ if (IS_ERR(bdev_handle)) {
ext4_msg(sb, KERN_ERR,
"failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev));
- return ERR_CAST(bdev);
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle));
+ return bdev_handle;
}
+ bdev = bdev_handle->bdev;
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
@@ -5912,12 +5923,12 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
*j_start = sb_block + 1;
*j_len = ext4_blocks_count(es);
brelse(bh);
- return bdev;
+ return bdev_handle;
out_bh:
brelse(bh);
out_bdev:
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return ERR_PTR(errno);
}
@@ -5927,14 +5938,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
journal_t *journal;
ext4_fsblk_t j_start;
ext4_fsblk_t j_len;
- struct block_device *journal_bdev;
+ struct bdev_handle *bdev_handle;
int errno = 0;
- journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
- if (IS_ERR(journal_bdev))
- return ERR_CAST(journal_bdev);
+ bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(bdev_handle))
+ return ERR_CAST(bdev_handle);
- journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start,
+ journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start,
j_len, sb->s_blocksize);
if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
@@ -5949,14 +5960,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
goto out_journal;
}
journal->j_private = sb;
- EXT4_SB(sb)->s_journal_bdev = journal_bdev;
+ EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- blkdev_put(journal_bdev, sb);
+ bdev_release(bdev_handle);
return ERR_PTR(errno);
}
@@ -6442,6 +6453,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
struct ext4_mount_options old_opts;
ext4_group_t g;
int err = 0;
+ int alloc_ctx;
#ifdef CONFIG_QUOTA
int enable_quota = 0;
int i, j;
@@ -6482,7 +6494,16 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
}
+ /*
+ * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
+ * two calls to ext4_should_dioread_nolock() to return inconsistent
+ * values, triggering WARN_ON in ext4_add_complete_io(). We grab
+ * s_writepages_rwsem here to avoid a race between writepages ops and
+ * remount.
+ */
+ alloc_ctx = ext4_writepages_down_write(sb);
ext4_apply_options(fc, sb);
+ ext4_writepages_up_write(sb, alloc_ctx);
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -6700,6 +6721,8 @@ restore_opts:
if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
+
+ alloc_ctx = ext4_writepages_down_write(sb);
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -6708,6 +6731,8 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
+ ext4_writepages_up_write(sb, alloc_ctx);
+
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
@@ -7127,7 +7152,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
}
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
@@ -7300,12 +7325,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
static void ext4_kill_sb(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
+ struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL;
kill_block_super(sb);
- if (journal_bdev)
- blkdev_put(journal_bdev, sb);
+ if (handle)
+ bdev_release(handle);
}
static struct file_system_type ext4_fs_type = {
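The super.c hunks switch the external journal device from bare block_device
pointers managed with blkdev_get_by_dev()/blkdev_put() to the bdev_handle API,
with the sbi field renamed from s_journal_bdev to s_journal_bdev_handle. The
shape of the conversion, condensed from the hunks above (kernel-internal code,
shown only to summarize the pattern):

	struct bdev_handle *bdev_handle;

	bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
				       sb, &fs_holder_ops);
	if (IS_ERR(bdev_handle))
		return bdev_handle;
	bdev = bdev_handle->bdev;	/* underlying device, e.g. for jbd2 */
	...
	bdev_release(bdev_handle);	/* replaces blkdev_put(bdev, sb) */

Every consumer that used to dereference the raw block_device (put_super, the
overhead calculation, kill_sb) now reaches it through ->bdev on the handle.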
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 92ba28cebac6..82dc5e673d5c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
-const struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler * const ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
@@ -356,7 +356,7 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
- return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
+ return ((u64) inode_get_ctime_sec(ea_inode) << 32) |
(u32) inode_peek_iversion_raw(ea_inode);
}
@@ -368,12 +368,12 @@ static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
{
- return (u32)ea_inode->i_atime.tv_sec;
+ return (u32) inode_get_atime_sec(ea_inode);
}
static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
{
- ea_inode->i_atime.tv_sec = hash;
+ inode_set_atime(ea_inode, hash, 0);
}
/*
@@ -418,7 +418,7 @@ free_bhs:
return ret;
}
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode)))
static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
u32 ea_inode_hash, struct inode **ea_inode)
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 824faf0b15a8..bd97c4aa8177 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -193,7 +193,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
extern void ext4_evict_ea_inode(struct inode *inode);
-extern const struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler * const ext4_xattr_handlers[];
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
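A second pattern that recurs through this series, visible in the namei.c,
super.c and xattr.c hunks above and continued in the f2fs hunks below, is the
move from direct i_atime/i_mtime field access to the inode timestamp accessor
helpers. The before/after shape, condensed from the hunks (illustrative only):

	/* old */
	dir->i_mtime = inode_set_ctime_current(dir);
	inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);

	/* new */
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	inode_set_atime(inode, le64_to_cpu(raw->i_atime),
			le32_to_cpu(raw->i_atime_nsec));

Reads go through inode_get_atime()/inode_get_mtime() or the _sec/_nsec
variants, which is what f2fs_is_time_consistent() and the ext4 xattr hash and
refcount helpers are converted to use.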
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d820801f473e..36e5dab6baae 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -893,14 +893,15 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc)
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
{
+#ifdef CONFIG_F2FS_CHECK_FS
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
- bool compressed = dn->data_blkaddr == COMPRESS_ADDR;
int cluster_end = 0;
+ unsigned int count;
int i;
char *reason = "";
- if (!compressed)
+ if (dn->data_blkaddr != COMPRESS_ADDR)
return false;
/* [..., COMPR_ADDR, ...] */
@@ -909,7 +910,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
goto out;
}
- for (i = 1; i < cluster_size; i++) {
+ for (i = 1, count = 1; i < cluster_size; i++, count++) {
block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + i);
@@ -929,19 +930,42 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
goto out;
}
}
+
+ f2fs_bug_on(F2FS_I_SB(dn->inode), count != cluster_size &&
+ !is_inode_flag_set(dn->inode, FI_COMPRESS_RELEASED));
+
return false;
out:
f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s",
dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason);
set_sbi_flag(sbi, SBI_NEED_FSCK);
return true;
+#else
+ return false;
+#endif
+}
+
+static int __f2fs_get_cluster_blocks(struct inode *inode,
+ struct dnode_of_data *dn)
+{
+ unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
+ int count, i;
+
+ for (i = 1, count = 1; i < cluster_size; i++) {
+ block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
+
+ if (__is_valid_data_blkaddr(blkaddr))
+ count++;
+ }
+
+ return count;
}
static int __f2fs_cluster_blocks(struct inode *inode,
- unsigned int cluster_idx, bool compr)
+ unsigned int cluster_idx, bool compr_blks)
{
struct dnode_of_data dn;
- unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
unsigned int start_idx = cluster_idx <<
F2FS_I(inode)->i_log_cluster_size;
int ret;
@@ -956,31 +980,14 @@ static int __f2fs_cluster_blocks(struct inode *inode,
if (f2fs_sanity_check_cluster(&dn)) {
ret = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER);
goto fail;
}
if (dn.data_blkaddr == COMPRESS_ADDR) {
- int i;
-
- ret = 1;
- for (i = 1; i < cluster_size; i++) {
- block_t blkaddr;
-
- blkaddr = data_blkaddr(dn.inode,
- dn.node_page, dn.ofs_in_node + i);
- if (compr) {
- if (__is_valid_data_blkaddr(blkaddr))
- ret++;
- } else {
- if (blkaddr != NULL_ADDR)
- ret++;
- }
- }
-
- f2fs_bug_on(F2FS_I_SB(inode),
- !compr && ret != cluster_size &&
- !is_inode_flag_set(inode, FI_COMPRESS_RELEASED));
+ if (compr_blks)
+ ret = __f2fs_get_cluster_blocks(inode, &dn);
+ else
+ ret = 1;
}
fail:
f2fs_put_dnode(&dn);
@@ -993,7 +1000,7 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc)
return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true);
}
-/* return # of valid blocks in compressed cluster */
+/* return whether the cluster is a compressed one or not */
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
{
return __f2fs_cluster_blocks(inode,
@@ -1976,7 +1983,7 @@ void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi)
int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
- char slab_name[32];
+ char slab_name[35];
if (!f2fs_sb_has_compression(sbi))
return 0;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 916e317ac925..4e42b5f24deb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1690,9 +1690,7 @@ next_block:
map->m_flags |= F2FS_MAP_NEW;
} else if (is_hole) {
if (f2fs_compressed_file(inode) &&
- f2fs_sanity_check_cluster(&dn) &&
- (flag != F2FS_GET_BLOCK_FIEMAP ||
- IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
+ f2fs_sanity_check_cluster(&dn)) {
err = -EFSCORRUPTED;
f2fs_handle_error(sbi,
ERROR_CORRUPTED_CLUSTER);
@@ -2344,8 +2342,10 @@ skip_reading_dnode:
f2fs_wait_on_block_writeback(inode, blkaddr);
if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
- if (atomic_dec_and_test(&dic->remaining_pages))
+ if (atomic_dec_and_test(&dic->remaining_pages)) {
f2fs_decompress_cluster(dic, true);
+ break;
+ }
continue;
}
@@ -2665,6 +2665,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (f2fs_is_atomic_file(inode))
return true;
+ /* rewrite low-ratio compressed data with OPU mode to avoid fragmentation */
+ if (f2fs_compressed_file(inode) &&
+ F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER &&
+ is_inode_flag_set(inode, FI_ENABLE_COMPRESS))
+ return true;
/* swap file is migrating in aligned write mode */
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
@@ -3023,7 +3028,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
{
int ret = 0;
int done = 0, retry = 0;
- struct page *pages[F2FS_ONSTACK_PAGES];
+ struct page *pages_local[F2FS_ONSTACK_PAGES];
+ struct page **pages = pages_local;
struct folio_batch fbatch;
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
struct bio *bio = NULL;
@@ -3047,6 +3053,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
#endif
int nr_folios, p, idx;
int nr_pages;
+ unsigned int max_pages = F2FS_ONSTACK_PAGES;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
@@ -3056,6 +3063,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int submitted = 0;
int i;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode) &&
+ 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) {
+ pages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
+ cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL);
+ max_pages = 1 << cc.log_cluster_size;
+ }
+#endif
+
folio_batch_init(&fbatch);
if (get_dirty_pages(mapping->host) <=
@@ -3101,7 +3117,7 @@ again:
add_more:
pages[nr_pages] = folio_page(folio, idx);
folio_get(folio);
- if (++nr_pages == F2FS_ONSTACK_PAGES) {
+ if (++nr_pages == max_pages) {
index = folio->index + idx + 1;
folio_batch_release(&fbatch);
goto write;
@@ -3283,6 +3299,11 @@ next:
if (bio)
f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (pages != pages_local)
+ kfree(pages);
+#endif
+
return ret;
}
@@ -4055,7 +4076,7 @@ next:
sis->highest_bit = cur_lblock - 1;
out:
if (not_aligned)
- f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)",
+ f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)",
not_aligned, blks_per_sec * F2FS_BLKSIZE);
return ret;
}
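The data.c hunks above also make the writeback page array track the compress
cluster size: when a compressed file's cluster (1 << log_cluster_size pages)
exceeds F2FS_ONSTACK_PAGES, the on-stack array is swapped for a heap
allocation so a full cluster can be collected before submission. Condensed
shape of the change, pulled together from the scattered hunks:

	struct page *pages_local[F2FS_ONSTACK_PAGES];
	struct page **pages = pages_local;
	unsigned int max_pages = F2FS_ONSTACK_PAGES;

	if (f2fs_compressed_file(inode) &&
	    1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) {
		pages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
				cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL);
		max_pages = 1 << cc.log_cluster_size;
	}
	...
	if (++nr_pages == max_pages)		/* was F2FS_ONSTACK_PAGES */
		goto write;
	...
	if (pages != pages_local)
		kfree(pages);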
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8aa29fe2e87b..042593aed1ec 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
de->file_type = fs_umode_to_ftype(inode->i_mode);
set_page_dirty(page);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
f2fs_i_links_write(dir, true);
clear_inode_flag(inode, FI_NEW_INODE);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (F2FS_I(dir)->i_current_depth != current_depth)
@@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
}
f2fs_put_page(page, 1);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 0e2d49140c07..ad8dfac73bd4 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -74,40 +74,14 @@ static void __set_extent_info(struct extent_info *ei,
}
}
-static bool __may_read_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, READ_EXTENT_CACHE))
- return false;
- if (is_inode_flag_set(inode, FI_NO_EXTENT))
- return false;
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
- !f2fs_sb_has_readonly(sbi))
- return false;
- return S_ISREG(inode->i_mode);
-}
-
-static bool __may_age_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, AGE_EXTENT_CACHE))
- return false;
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
- return false;
- if (file_is_cold(inode))
- return false;
-
- return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
-}
-
static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
{
if (type == EX_READ)
- return __may_read_extent_tree(inode);
- else if (type == EX_BLOCK_AGE)
- return __may_age_extent_tree(inode);
+ return test_opt(F2FS_I_SB(inode), READ_EXTENT_CACHE) &&
+ S_ISREG(inode->i_mode);
+ if (type == EX_BLOCK_AGE)
+ return test_opt(F2FS_I_SB(inode), AGE_EXTENT_CACHE) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode));
return false;
}
@@ -120,7 +94,22 @@ static bool __may_extent_tree(struct inode *inode, enum extent_type type)
if (list_empty(&F2FS_I_SB(inode)->s_list))
return false;
- return __init_may_extent_tree(inode, type);
+ if (!__init_may_extent_tree(inode, type))
+ return false;
+
+ if (type == EX_READ) {
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ return false;
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
+ !f2fs_sb_has_readonly(F2FS_I_SB(inode)))
+ return false;
+ } else if (type == EX_BLOCK_AGE) {
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
+ return false;
+ if (file_is_cold(inode))
+ return false;
+ }
+ return true;
}
static void __try_update_largest_extent(struct extent_tree *et,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6d688e42d89c..9043cedfa12b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1234,6 +1234,7 @@ struct f2fs_bio_info {
#define FDEV(i) (sbi->devs[i])
#define RDEV(i) (raw_super->devs[i])
struct f2fs_dev_info {
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
char path[MAX_PATH_LEN];
unsigned int total_segments;
@@ -3317,13 +3318,15 @@ static inline void clear_file(struct inode *inode, int type)
static inline bool f2fs_is_time_consistent(struct inode *inode)
{
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts = inode_get_atime(inode);
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts))
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts))
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts))
return false;
return true;
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ca5904129b16..e50363583f01 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -798,7 +798,7 @@ int f2fs_truncate(struct inode *inode)
if (err)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
@@ -905,9 +905,9 @@ static void __setattr_copy(struct mnt_idmap *idmap,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
@@ -1012,7 +1012,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return err;
spin_lock(&F2FS_I(inode)->i_size_lock);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
F2FS_I(inode)->last_disk_size = i_size_read(inode);
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
@@ -1840,7 +1840,7 @@ static long f2fs_fallocate(struct file *file, int mode,
}
if (!ret) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -2888,10 +2888,10 @@ out_src:
if (ret)
goto out_unlock;
- src->i_mtime = inode_set_ctime_current(src);
+ inode_set_mtime_to_ts(src, inode_set_ctime_current(src));
f2fs_mark_inode_dirty_sync(src, false);
if (src != dst) {
- dst->i_mtime = inode_set_ctime_current(dst);
+ inode_set_mtime_to_ts(dst, inode_set_ctime_current(dst));
f2fs_mark_inode_dirty_sync(dst, false);
}
f2fs_update_time(sbi, REQ_TIME);
@@ -3258,11 +3258,12 @@ int f2fs_precache_extents(struct inode *inode)
return -EOPNOTSUPP;
map.m_lblk = 0;
+ map.m_pblk = 0;
map.m_next_pgofs = NULL;
map.m_next_extent = &m_next_extent;
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
- end = max_file_blocks(inode);
+ end = F2FS_BLK_ALIGN(i_size_read(inode));
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
@@ -3270,7 +3271,7 @@ int f2fs_precache_extents(struct inode *inode)
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- if (err)
+ if (err || !map.m_len)
return err;
map.m_lblk = m_next_extent;
@@ -4005,6 +4006,15 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
F2FS_I(inode)->i_compress_algorithm = option.algorithm;
F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size;
F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size);
+ /* Set default level */
+ if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD)
+ F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
+ else
+ F2FS_I(inode)->i_compress_level = 0;
+ /* Adjust mount option level */
+ if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm &&
+ F2FS_OPTION(sbi).compress_level)
+ F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level;
f2fs_mark_inode_dirty_sync(inode, true);
if (!f2fs_is_compress_backend_ready(inode))
@@ -4849,6 +4859,9 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
filp->f_mode &= ~FMODE_RANDOM;
spin_unlock(&filp->f_lock);
return 0;
+ } else if (advice == POSIX_FADV_WILLNEED && offset == 0) {
+ /* Load extent cache at the first readahead. */
+ f2fs_precache_extents(inode);
}
err = generic_fadvise(filp, offset, len, advice);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 2fe25619ccb5..ac00423f117b 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -699,7 +699,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
f2fs_put_page(page, 1);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cde243840abd..560bfcad1af2 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -315,7 +315,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
f2fs_has_inline_xattr(inode) &&
(!fi->i_inline_xattr_size ||
fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
- f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu",
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %lu",
__func__, inode->i_ino, fi->i_inline_xattr_size,
MAX_INLINE_XATTR_SIZE);
return false;
@@ -386,9 +386,9 @@ static void init_idisk_time(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
- fi->i_disk_time[0] = inode->i_atime;
+ fi->i_disk_time[0] = inode_get_atime(inode);
fi->i_disk_time[1] = inode_get_ctime(inode);
- fi->i_disk_time[2] = inode->i_mtime;
+ fi->i_disk_time[2] = inode_get_mtime(inode);
}
static int do_read_inode(struct inode *inode)
@@ -417,12 +417,12 @@ static int do_read_inode(struct inode *inode)
inode->i_size = le64_to_cpu(ri->i_size);
inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1);
- inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
+ inode_set_atime(inode, le64_to_cpu(ri->i_atime),
+ le32_to_cpu(ri->i_atime_nsec));
inode_set_ctime(inode, le64_to_cpu(ri->i_ctime),
le32_to_cpu(ri->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(ri->i_mtime),
+ le32_to_cpu(ri->i_mtime_nsec));
inode->i_generation = le32_to_cpu(ri->i_generation);
if (S_ISDIR(inode->i_mode))
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
@@ -698,12 +698,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
}
set_raw_inline(inode, ri);
- ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ri->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ ri->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ ri->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ ri->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ri->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
if (S_ISDIR(inode->i_mode))
ri->i_current_depth =
cpu_to_le32(F2FS_I(inode)->i_current_depth);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 193b22a2d6bf..d0053b0284d8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -243,8 +243,8 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap,
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- F2FS_I(inode)->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ F2FS_I(inode)->i_crtime = inode_get_mtime(inode);
inode->i_generation = get_random_u32();
if (S_ISDIR(inode->i_mode))
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ee2e1dd64f25..6c7f6a649d27 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -633,7 +633,7 @@ static void f2fs_ra_node_pages(struct page *parent, int start, int n)
/* Then, try readahead for siblings of the desired node */
end = start + n;
- end = min(end, NIDS_PER_BLOCK);
+ end = min(end, (int)NIDS_PER_BLOCK);
for (i = start; i < end; i++) {
nid = get_nid(parent, i, false);
f2fs_ra_node_page(sbi, nid);
@@ -1467,7 +1467,8 @@ page_hit:
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = -EINVAL;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
+ err = -EFSCORRUPTED;
out_err:
ClearPageUptodate(page);
out_put_err:
@@ -2389,7 +2390,7 @@ static int scan_nat_page(struct f2fs_sb_info *sbi,
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
if (blk_addr == NEW_ADDR)
- return -EINVAL;
+ return -EFSCORRUPTED;
if (blk_addr == NULL_ADDR) {
add_free_nid(sbi, start_nid, true, true);
@@ -2504,7 +2505,14 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
if (ret) {
f2fs_up_read(&nm_i->nat_tree_lock);
- f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+
+ if (ret == -EFSCORRUPTED) {
+ f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi,
+ ERROR_INCONSISTENT_NAT);
+ }
+
return ret;
}
}
@@ -2743,7 +2751,9 @@ recover_xnid:
f2fs_update_inode_page(inode);
/* 3: update and set xattr node page dirty */
- memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
+ if (page)
+ memcpy(F2FS_NODE(xpage), F2FS_NODE(page),
+ VALID_XATTR_BLOCK_SIZE);
set_page_dirty(xpage);
f2fs_put_page(xpage, 1);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 7be60df277a5..b56d0f1078a7 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -320,12 +320,12 @@ static int recover_inode(struct inode *inode, struct page *page)
}
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
+ inode_set_atime(inode, le64_to_cpu(raw->i_atime),
+ le32_to_cpu(raw->i_atime_nsec));
inode_set_ctime(inode, le64_to_cpu(raw->i_ctime),
le32_to_cpu(raw->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(raw->i_mtime),
+ le32_to_cpu(raw->i_mtime_nsec));
F2FS_I(inode)->i_advise = raw->i_advise;
F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d05b41608fc0..727d016318f9 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -4910,22 +4910,31 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
}
/*
- * The write pointer matches with the valid blocks or
- * already points to the end of the zone.
+ * When safely unmounted in the previous mount, we can trust write
+ * pointers. Otherwise, finish zones.
*/
- if ((last_valid_block + 1 == wp_block) ||
- (zone->wp == zone->start + zone->len))
- return 0;
-
- if (last_valid_block + 1 == zone_block) {
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
/*
- * If there is no valid block in the zone and if write pointer
- * is not at zone start, reset the write pointer.
+ * The write pointer matches with the valid blocks or
+ * already points to the end of the zone.
*/
- f2fs_notice(sbi,
- "Zone without valid block has non-zero write "
- "pointer. Reset the write pointer: wp[0x%x,0x%x]",
- wp_segno, wp_blkoff);
+ if ((last_valid_block + 1 == wp_block) ||
+ (zone->wp == zone->start + zone->len))
+ return 0;
+ }
+
+ if (last_valid_block + 1 == zone_block) {
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ /*
+ * If there is no valid block in the zone and if write
+ * pointer is not at zone start, reset the write
+ * pointer.
+ */
+ f2fs_notice(sbi,
+ "Zone without valid block has non-zero write "
+ "pointer. Reset the write pointer: wp[0x%x,0x%x]",
+ wp_segno, wp_blkoff);
+ }
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
@@ -4935,18 +4944,20 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
return ret;
}
- /*
- * If there are valid blocks and the write pointer doesn't
- * match with them, we need to report the inconsistency and
- * fill the zone till the end to close the zone. This inconsistency
- * does not cause write error because the zone will not be selected
- * for write operation until it get discarded.
- */
- f2fs_notice(sbi, "Valid blocks are not aligned with write pointer: "
- "valid block[0x%x,0x%x] wp[0x%x,0x%x]",
- GET_SEGNO(sbi, last_valid_block),
- GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
- wp_segno, wp_blkoff);
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ /*
+ * If there are valid blocks and the write pointer doesn't match
+ * with them, we need to report the inconsistency and fill
+ * the zone till the end to close the zone. This inconsistency
+ * does not cause write error because the zone will not be
+ * selected for write operation until it gets discarded.
+ */
+ f2fs_notice(sbi, "Valid blocks are not aligned with write "
+ "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]",
+ GET_SEGNO(sbi, last_valid_block),
+ GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
+ wp_segno, wp_blkoff);
+ }
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
zone->start, zone->len, GFP_NOFS);
@@ -5020,18 +5031,27 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
- wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
- wp_segno = GET_SEGNO(sbi, wp_block);
- wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
- wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
-
- if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
- wp_sector_off == 0)
- return 0;
+ /*
+ * When safely unmounted in the previous mount, we could use current
+ * segments. Otherwise, allocate new sections.
+ */
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
+ wp_segno = GET_SEGNO(sbi, wp_block);
+ wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
+ wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
+
+ if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
+ wp_sector_off == 0)
+ return 0;
- f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
- "curseg[0x%x,0x%x] wp[0x%x,0x%x]",
- type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff);
+ f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
+ "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno,
+ cs->next_blkoff, wp_segno, wp_blkoff);
+ } else {
+ f2fs_notice(sbi, "Not successfully unmounted in the previous "
+ "mount");
+ }
f2fs_notice(sbi, "Assign new section to curseg[%d]: "
"curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 2ca8fb5d0dc4..8129be788bd5 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -108,11 +108,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\
(sbi)->log_blocks_per_seg))
#define GET_SEC_FROM_SEG(sbi, segno) \
- (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec)
+ (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec)
#define GET_SEG_FROM_SEC(sbi, secno) \
((secno) * (sbi)->segs_per_sec)
#define GET_ZONE_FROM_SEC(sbi, secno) \
- (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone)
+ (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone)
#define GET_ZONE_FROM_SEG(sbi, segno) \
GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a8c8232852bb..033af907c3b1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -83,11 +83,26 @@ void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
#endif
/* f2fs-wide shrinker description */
-static struct shrinker f2fs_shrinker_info = {
- .scan_objects = f2fs_shrink_scan,
- .count_objects = f2fs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *f2fs_shrinker_info;
+
+static int __init f2fs_init_shrinker(void)
+{
+ f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker");
+ if (!f2fs_shrinker_info)
+ return -ENOMEM;
+
+ f2fs_shrinker_info->count_objects = f2fs_shrink_count;
+ f2fs_shrinker_info->scan_objects = f2fs_shrink_scan;
+
+ shrinker_register(f2fs_shrinker_info);
+
+ return 0;
+}
+
+static void f2fs_exit_shrinker(void)
+{
+ shrinker_free(f2fs_shrinker_info);
+}
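The hunk above converts f2fs from a statically declared struct shrinker to the dynamic shrinker_alloc()/shrinker_register()/shrinker_free() API. A minimal registration sketch under that API; the demo_* names are hypothetical and the callback bodies are placeholders (count_objects reports how many objects could be freed, scan_objects frees some and returns the number freed, or SHRINK_STOP):

#include <linux/shrinker.h>

static unsigned long demo_shrink_count(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	return 0;			/* placeholder: nothing to reclaim */
}

static unsigned long demo_shrink_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
{
	return SHRINK_STOP;		/* placeholder: nothing was freed */
}

static struct shrinker *demo_shrinker;

static int __init demo_init_shrinker(void)
{
	demo_shrinker = shrinker_alloc(0, "demo-shrinker");
	if (!demo_shrinker)
		return -ENOMEM;

	demo_shrinker->count_objects = demo_shrink_count;
	demo_shrinker->scan_objects = demo_shrink_scan;

	shrinker_register(demo_shrinker);	/* visible to reclaim from here on */
	return 0;
}

static void demo_exit_shrinker(void)
{
	shrinker_free(demo_shrinker);		/* unregister and free */
}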
enum {
Opt_gc_background,
@@ -547,6 +562,29 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
+static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
+ const char *new_ext, bool is_ext)
+{
+ unsigned char (*ext)[F2FS_EXTENSION_LEN];
+ int ext_cnt;
+ int i;
+
+ if (is_ext) {
+ ext = F2FS_OPTION(sbi).extensions;
+ ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ } else {
+ ext = F2FS_OPTION(sbi).noextensions;
+ ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ }
+
+ for (i = 0; i < ext_cnt; i++) {
+ if (!strcasecmp(new_ext, ext[i]))
+ return true;
+ }
+
+ return false;
+}
+
/*
 * 1. The same extension name cannot appear in both compress and non-compress extension
* at the same time.
@@ -1149,6 +1187,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
}
+ if (is_compress_extension_exist(sbi, name, true)) {
+ kfree(name);
+ break;
+ }
+
strcpy(ext[ext_cnt], name);
F2FS_OPTION(sbi).compress_ext_cnt++;
kfree(name);
@@ -1173,6 +1216,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
}
+ if (is_compress_extension_exist(sbi, name, false)) {
+ kfree(name);
+ break;
+ }
+
strcpy(noext[noext_cnt], name);
F2FS_OPTION(sbi).nocompress_ext_cnt++;
kfree(name);
@@ -1562,7 +1610,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
for (i = 0; i < sbi->s_ndevs; i++) {
if (i > 0)
- blkdev_put(FDEV(i).bdev, sbi->sb);
+ bdev_release(FDEV(i).bdev_handle);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
#endif
@@ -1629,7 +1677,7 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
- if (err) {
+ if (err || f2fs_cp_error(sbi)) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
}
@@ -2286,9 +2334,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_sb_flags;
int err;
bool need_restart_gc = false, need_stop_gc = false;
- bool need_restart_ckpt = false, need_stop_ckpt = false;
bool need_restart_flush = false, need_stop_flush = false;
bool need_restart_discard = false, need_stop_discard = false;
+ bool need_enable_checkpoint = false, need_disable_checkpoint = false;
bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
@@ -2452,24 +2500,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
clear_sbi_flag(sbi, SBI_IS_CLOSE);
}
- if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
- !test_opt(sbi, MERGE_CHECKPOINT)) {
- f2fs_stop_ckpt_thread(sbi);
- need_restart_ckpt = true;
- } else {
- /* Flush if the prevous checkpoint, if exists. */
- f2fs_flush_ckpt_thread(sbi);
-
- err = f2fs_start_ckpt_thread(sbi);
- if (err) {
- f2fs_err(sbi,
- "Failed to start F2FS issue_checkpoint_thread (%d)",
- err);
- goto restore_gc;
- }
- need_stop_ckpt = true;
- }
-
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
@@ -2481,7 +2511,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
} else {
err = f2fs_create_flush_cmd_control(sbi);
if (err)
- goto restore_ckpt;
+ goto restore_gc;
need_stop_flush = true;
}
@@ -2503,8 +2533,31 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
err = f2fs_disable_checkpoint(sbi);
if (err)
goto restore_discard;
+ need_enable_checkpoint = true;
} else {
f2fs_enable_checkpoint(sbi);
+ need_disable_checkpoint = true;
+ }
+ }
+
+ /*
+ * Place this routine at the end, since a new checkpoint would be
+ * triggered during remount, and we need to take care of it before
+ * returning from remount.
+ */
+ if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
+ !test_opt(sbi, MERGE_CHECKPOINT)) {
+ f2fs_stop_ckpt_thread(sbi);
+ } else {
+ /* Flush the previous checkpoint, if it exists. */
+ f2fs_flush_ckpt_thread(sbi);
+
+ err = f2fs_start_ckpt_thread(sbi);
+ if (err) {
+ f2fs_err(sbi,
+ "Failed to start F2FS issue_checkpoint_thread (%d)",
+ err);
+ goto restore_checkpoint;
}
}
@@ -2522,6 +2575,13 @@ skip:
adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
+restore_checkpoint:
+ if (need_enable_checkpoint) {
+ f2fs_enable_checkpoint(sbi);
+ } else if (need_disable_checkpoint) {
+ if (f2fs_disable_checkpoint(sbi))
+ f2fs_warn(sbi, "checkpoint has not been disabled");
+ }
restore_discard:
if (need_restart_discard) {
if (f2fs_start_discard_thread(sbi))
@@ -2537,13 +2597,6 @@ restore_flush:
clear_opt(sbi, FLUSH_MERGE);
f2fs_destroy_flush_cmd_control(sbi, false);
}
-restore_ckpt:
- if (need_restart_ckpt) {
- if (f2fs_start_ckpt_thread(sbi))
- f2fs_warn(sbi, "background ckpt thread has stopped");
- } else if (need_stop_ckpt) {
- f2fs_stop_ckpt_thread(sbi);
- }
restore_gc:
if (need_restart_gc) {
if (f2fs_start_gc_thread(sbi))
@@ -2710,7 +2763,7 @@ retry:
if (len == towrite)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
return len - towrite;
}
@@ -3203,13 +3256,6 @@ static bool f2fs_has_stable_inodes(struct super_block *sb)
return true;
}
-static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(nid_t);
- *lblk_bits_ret = 8 * sizeof(block_t);
-}
-
static struct block_device **f2fs_get_devices(struct super_block *sb,
unsigned int *num_devs)
{
@@ -3231,13 +3277,15 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
}
static const struct fscrypt_operations f2fs_cryptops = {
- .key_prefix = "f2fs:",
+ .needs_bounce_pages = 1,
+ .has_32bit_inodes = 1,
+ .supports_subblock_data_units = 1,
+ .legacy_key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
.get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
.has_stable_inodes = f2fs_has_stable_inodes,
- .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
.get_devices = f2fs_get_devices,
};
#endif
@@ -3282,6 +3330,7 @@ static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
}
static const struct export_operations f2fs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = f2fs_fh_to_dentry,
.fh_to_parent = f2fs_fh_to_parent,
.get_parent = f2fs_get_parent,
@@ -3469,7 +3518,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
- /* Currently, support 512/1024/2048/4096 bytes sector size */
+ /* Currently, 512/1024/2048/4096/16K-byte sector sizes are supported */
if (le32_to_cpu(raw_super->log_sectorsize) >
F2FS_MAX_LOG_SECTOR_SIZE ||
le32_to_cpu(raw_super->log_sectorsize) <
@@ -4198,7 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
for (i = 0; i < max_devices; i++) {
if (i == 0)
- FDEV(0).bdev = sbi->sb->s_bdev;
+ FDEV(0).bdev_handle = sbi->sb->s_bdev_handle;
else if (!RDEV(i).path[0])
break;
@@ -4218,13 +4267,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
FDEV(i).end_blk = FDEV(i).start_blk +
(FDEV(i).total_segments <<
sbi->log_blocks_per_seg) - 1;
- FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
- mode, sbi->sb, NULL);
+ FDEV(i).bdev_handle = bdev_open_by_path(
+ FDEV(i).path, mode, sbi->sb, NULL);
}
}
- if (IS_ERR(FDEV(i).bdev))
- return PTR_ERR(FDEV(i).bdev);
+ if (IS_ERR(FDEV(i).bdev_handle))
+ return PTR_ERR(FDEV(i).bdev_handle);
+ FDEV(i).bdev = FDEV(i).bdev_handle->bdev;
/* to release errored devices */
sbi->s_ndevs = i + 1;
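The device-scan hunk above now keeps a struct bdev_handle next to the raw block_device, opening with bdev_open_by_path() and closing with bdev_release() instead of blkdev_get_by_path()/blkdev_put(). A sketch of that lifecycle as it stands at this point in the series; struct demo_dev and the parameters are hypothetical:

#include <linux/blkdev.h>
#include <linux/err.h>

struct demo_dev {
	struct bdev_handle *bdev_handle;	/* owns the open */
	struct block_device *bdev;		/* raw bdev for the I/O paths */
};

static int demo_open_device(struct demo_dev *dev, const char *path,
			    blk_mode_t mode, void *holder)
{
	dev->bdev_handle = bdev_open_by_path(path, mode, holder, NULL);
	if (IS_ERR(dev->bdev_handle))
		return PTR_ERR(dev->bdev_handle);

	dev->bdev = dev->bdev_handle->bdev;
	return 0;
}

static void demo_close_device(struct demo_dev *dev)
{
	bdev_release(dev->bdev_handle);		/* replaces blkdev_put() */
}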
@@ -4915,7 +4965,7 @@ static int __init init_f2fs_fs(void)
int err;
if (PAGE_SIZE != F2FS_BLKSIZE) {
- printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n",
+ printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n",
PAGE_SIZE, F2FS_BLKSIZE);
return -EINVAL;
}
@@ -4944,7 +4994,7 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_sysfs();
if (err)
goto free_garbage_collection_cache;
- err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
+ err = f2fs_init_shrinker();
if (err)
goto free_sysfs;
err = register_filesystem(&f2fs_fs_type);
@@ -4989,7 +5039,7 @@ free_root_stats:
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
- unregister_shrinker(&f2fs_shrinker_info);
+ f2fs_exit_shrinker();
free_sysfs:
f2fs_exit_sysfs();
free_garbage_collection_cache:
@@ -5021,7 +5071,7 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
- unregister_shrinker(&f2fs_shrinker_info);
+ f2fs_exit_shrinker();
f2fs_exit_sysfs();
f2fs_destroy_garbage_collection_cache();
f2fs_destroy_extent_cache();
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index a657284faee3..47e88b4d4e7d 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -189,7 +189,7 @@ const struct xattr_handler f2fs_xattr_security_handler = {
.set = f2fs_xattr_generic_set,
};
-static const struct xattr_handler *f2fs_xattr_handler_map[] = {
+static const struct xattr_handler * const f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
[F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -202,7 +202,7 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
};
-const struct xattr_handler *f2fs_xattr_handlers[] = {
+const struct xattr_handler * const f2fs_xattr_handlers[] = {
&f2fs_xattr_user_handler,
&f2fs_xattr_trusted_handler,
#ifdef CONFIG_F2FS_FS_SECURITY
@@ -364,10 +364,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
*xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name);
if (!*xe) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
- err = -EFSCORRUPTED;
+ err = -ENODATA;
f2fs_handle_error(F2FS_I_SB(inode),
ERROR_CORRUPTED_XATTR);
goto out;
@@ -584,13 +584,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
(void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
- error = -EFSCORRUPTED;
f2fs_handle_error(F2FS_I_SB(inode),
ERROR_CORRUPTED_XATTR);
- goto cleanup;
+ break;
}
if (!prefix)
@@ -650,7 +649,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (size > MAX_VALUE_LEN(inode))
return -E2BIG;
-
+retry:
error = read_all_xattrs(inode, ipage, &base_addr);
if (error)
return error;
@@ -660,7 +659,14 @@ static int __f2fs_setxattr(struct inode *inode, int index,
/* find entry with wanted name. */
here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name);
if (!here) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ if (!F2FS_I(inode)->i_xattr_nid) {
+ f2fs_notice(F2FS_I_SB(inode),
+ "recover xattr in inode (%lu)", inode->i_ino);
+ f2fs_recover_xattr_data(inode, NULL);
+ kfree(base_addr);
+ goto retry;
+ }
+ f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index b1811c392e6f..a005ffdcf717 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -125,7 +125,7 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;
extern const struct xattr_handler f2fs_xattr_security_handler;
-extern const struct xattr_handler *f2fs_xattr_handlers[];
+extern const struct xattr_handler * const f2fs_xattr_handlers[];
extern int f2fs_setxattr(struct inode *, int, const char *,
const void *, size_t, struct page *, int);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index cdd39b6020f3..1fac3dabf130 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -512,6 +512,7 @@ static int fat_validate_dir(struct inode *dir)
int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct timespec64 mtime;
int error;
MSDOS_I(inode)->i_pos = 0;
@@ -561,14 +562,18 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
- fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
- inode_set_ctime_to_ts(inode, inode->i_mtime);
+ fat_time_fat2unix(sbi, &mtime, de->time, de->date, 0);
+ inode_set_mtime_to_ts(inode, mtime);
+ inode_set_ctime_to_ts(inode, mtime);
if (sbi->options.isvfat) {
- fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
+ struct timespec64 atime;
+
+ fat_time_fat2unix(sbi, &atime, 0, de->adate, 0);
+ inode_set_atime_to_ts(inode, atime);
fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
de->cdate, de->ctime_cs);
} else
- inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime);
+ inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, &mtime));
return 0;
}
@@ -849,6 +854,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh;
struct msdos_dir_entry *raw_entry;
+ struct timespec64 mtime;
loff_t i_pos;
sector_t blocknr;
int err, offset;
@@ -882,12 +888,14 @@ retry:
raw_entry->size = cpu_to_le32(inode->i_size);
raw_entry->attr = fat_make_attrs(inode);
fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart);
- fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
+ mtime = inode_get_mtime(inode);
+ fat_time_unix2fat(sbi, &mtime, &raw_entry->time,
&raw_entry->date, NULL);
if (sbi->options.isvfat) {
+ struct timespec64 ts = inode_get_atime(inode);
__le16 atime;
- fat_time_unix2fat(sbi, &inode->i_atime, &atime,
- &raw_entry->adate, NULL);
+
+ fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL);
fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime,
&raw_entry->cdate, &raw_entry->ctime_cs);
}
@@ -1407,7 +1415,8 @@ static int fat_read_root(struct inode *inode)
MSDOS_I(inode)->mmu_private = inode->i_size;
fat_save_attrs(inode, ATTR_DIR);
- inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0)));
set_nlink(inode, fat_subdirs(inode)+2);
return 0;
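The FAT hunks above rely on the inode time setters returning the timespec64 they store, so one value can be pushed into ctime, atime and mtime in a single expression. A small sketch of that chaining (resetting all three timestamps to the epoch, as fat_read_root() does):

#include <linux/fs.h>

static void demo_reset_times(struct inode *inode)
{
	/* inode_set_ctime() returns the value it stored; feed it through
	 * inode_set_atime_to_ts() and inode_set_mtime_to_ts() in turn. */
	inode_set_mtime_to_ts(inode,
		inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0)));
}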
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index f2304a1054aa..c7a2d27120ba 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -325,15 +325,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
}
if (flags & S_ATIME)
- inode->i_atime = fat_truncate_atime(sbi, now);
+ inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, now));
/*
* ctime and mtime share the same on-disk field, and should be
* identical in memory. all mtime updates will be applied to ctime,
* but ctime updates are ignored.
*/
if (flags & S_MTIME)
- inode->i_mtime = inode_set_ctime_to_ts(inode,
- fat_truncate_mtime(sbi, now));
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, fat_truncate_mtime(sbi, now)));
return 0;
}
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index 3626eb585a98..c52e63e10d35 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -279,6 +279,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir)
}
const struct export_operations fat_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = fat_fh_to_dentry,
.fh_to_parent = fat_fh_to_parent,
.get_parent = fat_get_parent,
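fat (and fuse further down) now opt into NFS file-handle encoding explicitly by setting .encode_fh = generic_encode_ino32_fh, which pairs with the exportfs_can_encode_fh() check added in fs/fhandle.c below. A sketch of the wiring for a filesystem with 32-bit inode numbers; the demo_* callbacks are stubs standing in for the filesystem's own lookup code:

#include <linux/exportfs.h>

static struct dentry *demo_fh_to_dentry(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return NULL;	/* stub: look up the inode encoded in @fid */
}

static struct dentry *demo_fh_to_parent(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return NULL;	/* stub: look up the parent encoded in @fid */
}

/* generic_encode_ino32_fh packs (ino32, gen) handles; the new
 * exportfs_can_encode_fh() check in fs/fhandle.c below gates
 * name_to_handle_at() on what the export ops advertise. */
static const struct export_operations demo_export_ops = {
	.encode_fh	= generic_encode_ino32_fh,
	.fh_to_dentry	= demo_fh_to_dentry,
	.fh_to_parent	= demo_fh_to_parent,
};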
diff --git a/fs/fcntl.c b/fs/fcntl.c
index e871009f6c88..c80a6acad742 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -844,7 +844,7 @@ int send_sigurg(struct fown_struct *fown)
}
static DEFINE_SPINLOCK(fasync_lock);
-static struct kmem_cache *fasync_cache __read_mostly;
+static struct kmem_cache *fasync_cache __ro_after_init;
static void fasync_free_rcu(struct rcu_head *head)
{
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6ea8d35a9382..18b3ba8dc8ea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -26,12 +26,8 @@ static long do_sys_name_to_handle(const struct path *path,
/*
* We need to make sure whether the file system support decoding of
* the file handle if decodeable file handle was requested.
- * Otherwise, even empty export_operations are sufficient to opt-in
- * to encoding FIDs.
*/
- if (!path->dentry->d_sb->s_export_op ||
- (!(fh_flags & EXPORT_FH_FID) &&
- !path->dentry->d_sb->s_export_op->fh_to_dentry))
+ if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags))
return -EOPNOTSUPP;
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..5fb0b146e79e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file)
struct files_struct *files = current->files;
struct fdtable *fdt;
+ if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
+ return;
+
rcu_read_lock_sched();
if (unlikely(files->resize_in_progress)) {
@@ -853,8 +856,104 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+ struct file __rcu *file;
+ struct file __rcu *file_reloaded;
+ struct file __rcu *file_reloaded_cmp;
+
+ file = rcu_dereference_raw(*f);
+ if (!file)
+ return NULL;
+
+ if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ return ERR_PTR(-EAGAIN);
+
+ file_reloaded = rcu_dereference_raw(*f);
+
+ /*
+ * Ensure that all accesses have a dependency on the load from
+ * rcu_dereference_raw() above so we get correct ordering
+ * between reuse/allocation and the pointer check below.
+ */
+ file_reloaded_cmp = file_reloaded;
+ OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+ /*
+ * atomic_long_inc_not_zero() above provided a full memory
+ * barrier when we acquired a reference.
+ *
+ * This is paired with the write barrier from assigning to the
+ * __rcu protected file pointer so that if that pointer still
+ * matches the current file, we know we have successfully
+ * acquired a reference to the right file.
+ *
+ * If the pointers don't match the file has been reallocated by
+ * SLAB_TYPESAFE_BY_RCU.
+ */
+ if (file == file_reloaded_cmp)
+ return file_reloaded;
+
+ fput(file);
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try to get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f, carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+ for (;;) {
+ struct file __rcu *file;
+
+ file = __get_file_rcu(f);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
+ continue;
+
+ return file;
+ }
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
+/**
+ * get_file_active - try to get a reference to a file
+ * @f: the file to get a reference on
+ *
+ * In contrast to get_file_rcu() the pointer itself isn't part of the
+ * reference counting.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_active(struct file **f)
+{
+ struct file __rcu *file;
+
+ rcu_read_lock();
+ file = __get_file_rcu(f);
+ rcu_read_unlock();
+ if (IS_ERR(file))
+ file = NULL;
+ return file;
+}
+EXPORT_SYMBOL_GPL(get_file_active);
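get_file_rcu() and get_file_active() above exist because the filp cache becomes SLAB_TYPESAFE_BY_RCU later in this diff (see fs/file_table.c): a struct file looked up locklessly may have been recycled, so the pointer has to be re-checked after the refcount bump. A hedged usage sketch; the slot structure is hypothetical, only get_file_active() and fput() come from the source:

#include <linux/fs.h>
#include <linux/file.h>

struct demo_slot {
	struct file *file;	/* published and cleared by other contexts */
};

static void demo_use_published_file(struct demo_slot *slot)
{
	struct file *f;

	/* Returns the file with an extra reference, or NULL if the slot is
	 * empty or the file is already going away. */
	f = get_file_active(&slot->file);
	if (!f)
		return;

	/* ... use f ... */

	fput(f);		/* drop the reference taken above */
}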
+
static inline struct file *__fget_files_rcu(struct files_struct *files,
- unsigned int fd, fmode_t mask)
+ unsigned int fd, fmode_t mask)
{
for (;;) {
struct file *file;
@@ -865,12 +964,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
return NULL;
fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
- file = rcu_dereference_raw(*fdentry);
- if (unlikely(!file))
- return NULL;
-
- if (unlikely(file->f_mode & mask))
- return NULL;
/*
* Ok, we have a file pointer. However, because we do
@@ -879,10 +972,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* Such a race can take two forms:
*
- * (a) the file ref already went down to zero,
- * and get_file_rcu() fails. Just try again:
+ * (a) the file ref already went down to zero and the
+ * file hasn't been reused yet, or the file count
+ * isn't zero but the file has already been reused.
*/
- if (unlikely(!get_file_rcu(file)))
+ file = __get_file_rcu(fdentry);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
continue;
/*
@@ -893,13 +991,21 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* If so, we need to put our ref and try again.
*/
- if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
- unlikely(rcu_dereference_raw(*fdentry) != file)) {
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
fput(file);
continue;
}
/*
+ * This isn't the file we're looking for or we're not
+ * allowed to get a reference to it.
+ */
+ if (unlikely(file->f_mode & mask)) {
+ fput(file);
+ return NULL;
+ }
+
+ /*
* Ok, we have a ref to the file, and checked that it
* still exists.
*/
@@ -948,7 +1054,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+ return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -957,13 +1070,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
task_lock(task);
files = task->files;
if (files)
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
task_unlock(task);
return file;
}
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -974,7 +1087,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
files = task->files;
if (files) {
for (; fd < files_fdtable(files)->max_fds; fd++) {
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
@@ -983,7 +1096,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1385,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
if (unlikely(newfd == oldfd)) { /* corner case */
struct files_struct *files = current->files;
+ struct file *f;
int retval = oldfd;
rcu_read_lock();
- if (!files_lookup_fd_rcu(files, oldfd))
+ f = __fget_files_rcu(files, oldfd, 0);
+ if (!f)
retval = -EBADF;
rcu_read_unlock();
+ if (f)
+ fput(f);
return retval;
}
return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..de4a2915bfd4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -40,14 +40,14 @@ static struct files_stat_struct files_stat = {
};
/* SLAB cache for file structures */
-static struct kmem_cache *filp_cachep __read_mostly;
+static struct kmem_cache *filp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path real_path;
+ struct path user_path;
};
static inline struct backing_file *backing_file(struct file *f)
@@ -55,31 +55,36 @@ static inline struct backing_file *backing_file(struct file *f)
return container_of(f, struct backing_file, file);
}
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
{
- return &backing_file(f)->real_path;
+ return &backing_file(f)->user_path;
}
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
-static void file_free_rcu(struct rcu_head *head)
+static inline void file_free(struct file *f)
{
- struct file *f = container_of(head, struct file, f_rcuhead);
-
+ security_file_free(f);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
put_cred(f->f_cred);
- if (unlikely(f->f_mode & FMODE_BACKING))
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ path_put(backing_file_user_path(f));
kfree(backing_file(f));
- else
+ } else {
kmem_cache_free(filp_cachep, f);
+ }
}
-static inline void file_free(struct file *f)
+void release_empty_file(struct file *f)
{
- security_file_free(f);
- if (unlikely(f->f_mode & FMODE_BACKING))
- path_put(backing_file_real_path(f));
- if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- percpu_counter_dec(&nr_files);
- call_rcu(&f->f_rcuhead, file_free_rcu);
+ WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
+ if (atomic_long_dec_and_test(&f->f_count)) {
+ security_file_free(f);
+ put_cred(f->f_cred);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
+ kmem_cache_free(filp_cachep, f);
+ }
}
/*
@@ -164,7 +169,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
return error;
}
- atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
@@ -172,6 +176,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
+ /*
+ * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+ * fget-rcu pattern users need to be able to handle spurious
+ * refcount bumps, we should reinitialize the reused file first.
+ */
+ atomic_long_set(&f->f_count, 1);
return 0;
}
@@ -471,7 +481,8 @@ EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ac5d43b164b5..20600e9ea202 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -109,11 +109,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi,
set_nlink(inode, vip->vii_nlink);
inode->i_size = vip->vii_size;
- inode->i_atime.tv_sec = vip->vii_atime;
+ inode_set_atime(inode, vip->vii_atime, 0);
inode_set_ctime(inode, vip->vii_ctime, 0);
- inode->i_mtime.tv_sec = vip->vii_mtime;
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode, vip->vii_mtime, 0);
inode->i_blocks = vip->vii_blocks;
inode->i_generation = vip->vii_gen;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 310d73e254df..e6e2a2185e7c 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -76,6 +76,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
{
struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb);
struct vxfs_sb *raw_sb = infp->vsi_raw;
+ u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
bufp->f_type = VXFS_SUPER_MAGIC;
bufp->f_bsize = dentry->d_sb->s_blocksize;
@@ -84,6 +85,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
bufp->f_bavail = 0;
bufp->f_files = 0;
bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree);
+ bufp->f_fsid = u64_to_fsid(id);
bufp->f_namelen = VXFS_NAMELEN;
return 0;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c1af01b2c42d..1767493dffda 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -613,6 +613,24 @@ out_free:
kfree(isw);
}
+static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+ struct list_head *list, int *nr)
+{
+ struct inode *inode;
+
+ list_for_each_entry(inode, list, i_io_list) {
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ continue;
+
+ isw->inodes[*nr] = inode;
+ (*nr)++;
+
+ if (*nr >= WB_MAX_INODES_PER_ISW - 1)
+ return true;
+ }
+ return false;
+}
+
/**
* cleanup_offline_cgwb - detach associated inodes
* @wb: target wb
@@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
- struct inode *inode;
int nr;
bool restart = false;
@@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
nr = 0;
spin_lock(&wb->list_lock);
- list_for_each_entry(inode, &wb->b_attached, i_io_list) {
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
- continue;
-
- isw->inodes[nr++] = inode;
-
- if (nr >= WB_MAX_INODES_PER_ISW - 1) {
- restart = true;
- break;
- }
- }
+ /*
+ * In addition to the inodes that have completed writeback, also switch
+ * cgwbs for those inodes with only dirty timestamps. Otherwise, those
+ * inodes won't be written back for a long time when lazytime is
+ * enabled, thus pinning the dying cgwbs. This won't break the
+ * bandwidth restrictions, as writeback of inode metadata is not
+ * accounted for.
+ */
+ restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+ if (!restart)
+ restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
diff --git a/fs/fsopen.c b/fs/fsopen.c
index ce03f6521c88..6593ae518115 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -465,6 +465,7 @@ SYSCALL_DEFINE5(fsconfig,
param.file = fget(aux);
if (!param.file)
goto out_key;
+ param.dirfd = aux;
break;
default:
break;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index ab62e4624256..284a35006462 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
inode->i_mode = mode;
inode->i_uid = fc->user_id;
inode->i_gid = fc->group_id;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
/* setting ->i_op to NULL is not allowed */
if (iop)
inode->i_op = iop;
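fuse_ctl_add_dentry() (and f2fs_new_inode() earlier) now call simple_inode_init_ts() instead of assigning i_atime/i_mtime from inode_set_ctime_current(). A minimal sketch, assuming simple_inode_init_ts() sets all three timestamps from one current-time reading and returns the timespec64 it stored (which is how the f2fs hunk's follow-up inode_get_mtime() read uses it):

#include <linux/fs.h>

static void demo_init_new_inode_times(struct inode *inode)
{
	/* atime = mtime = ctime = "now", in one call. */
	struct timespec64 now = simple_inode_init_ts(inode);

	/* A filesystem with a creation-time field could record it here,
	 * e.g. my_info(inode)->crtime = now;  (hypothetical field) */
	(void)now;
}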
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 23904a6a9a96..12ef91d170bb 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -1222,6 +1222,7 @@ void fuse_dax_conn_free(struct fuse_conn *fc)
if (fc->dax) {
fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
kfree(fc->dax);
+ fc->dax = NULL;
}
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d707e6987da9..d19cbf34c634 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1812,12 +1812,12 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
memset(&outarg, 0, sizeof(outarg));
inarg.valid = FATTR_MTIME;
- inarg.mtime = inode->i_mtime.tv_sec;
- inarg.mtimensec = inode->i_mtime.tv_nsec;
+ inarg.mtime = inode_get_mtime_sec(inode);
+ inarg.mtimensec = inode_get_mtime_nsec(inode);
if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
- inarg.ctime = inode_get_ctime(inode).tv_sec;
- inarg.ctimensec = inode_get_ctime(inode).tv_nsec;
+ inarg.ctime = inode_get_ctime_sec(inode);
+ inarg.ctimensec = inode_get_ctime_nsec(inode);
}
if (ff) {
inarg.valid |= FATTR_FH;
@@ -1956,7 +1956,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
/* the kernel maintains i_mtime locally */
if (trust_local_cmtime) {
if (attr->ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (attr->ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
/* FIXME: clear I_DIRTY_SYNC? */
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..a660f1f21540 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1448,7 +1448,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!ia)
return -ENOMEM;
- if (fopen_direct_io && fc->direct_io_relax) {
+ if (fopen_direct_io && fc->direct_io_allow_mmap) {
res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
if (res) {
fuse_io_free(ia);
@@ -1574,6 +1574,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t res;
bool exclusive_lock =
!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
+ get_fuse_conn(inode)->direct_io_allow_mmap ||
iocb->ki_flags & IOCB_APPEND ||
fuse_direct_write_extending_i_size(iocb, from);
@@ -1581,6 +1582,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
* Take exclusive lock if
* - Parallel direct writes are disabled - a user space decision
* - Parallel direct writes are enabled and i_size is being extended.
+ * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP).
* This might not be needed at all, but needs further investigation.
*/
if (exclusive_lock)
@@ -2466,9 +2468,9 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
if (ff->open_flags & FOPEN_DIRECT_IO) {
/* Can't provide the coherency needed for MAP_SHARED
- * if FUSE_DIRECT_IO_RELAX isn't set.
+ * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
*/
- if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_relax)
+ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
return -ENODEV;
invalidate_inode_pages2(file->f_mapping);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bf0b85d0b95c..1df83eebda92 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -63,6 +63,19 @@ struct fuse_forget_link {
struct fuse_forget_link *next;
};
+/* Submount lookup tracking */
+struct fuse_submount_lookup {
+ /** Refcount */
+ refcount_t count;
+
+ /** Unique ID, which identifies the inode between userspace
+ * and kernel */
+ u64 nodeid;
+
+ /** The request used for sending the FORGET message */
+ struct fuse_forget_link *forget;
+};
+
/** FUSE inode */
struct fuse_inode {
/** Inode data */
@@ -158,6 +171,8 @@ struct fuse_inode {
*/
struct fuse_inode_dax *dax;
#endif
+ /** Submount specific lookup tracking */
+ struct fuse_submount_lookup *submount_lookup;
};
/** FUSE inode state bits */
@@ -797,8 +812,8 @@ struct fuse_conn {
/* Is tmpfile not implemented by fs? */
unsigned int no_tmpfile:1;
- /* relax restrictions in FOPEN_DIRECT_IO mode */
- unsigned int direct_io_relax:1;
+ /* Relax restrictions to allow shared mmap in FOPEN_DIRECT_IO mode */
+ unsigned int direct_io_allow_mmap:1;
/* Is statx not implemented by fs? */
unsigned int no_statx:1;
@@ -1284,7 +1299,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size);
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
int fuse_removexattr(struct inode *inode, const char *name);
-extern const struct xattr_handler *fuse_xattr_handlers[];
+extern const struct xattr_handler * const fuse_xattr_handlers[];
struct posix_acl;
struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu);
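The new struct fuse_submount_lookup above lets a submount share its parent's lookup state so that the final FORGET for the nodeid is sent only once every user is gone. Stripped of the fuse specifics, the lifecycle in the fs/fuse/inode.c hunk below is the usual refcount_t pattern; a sketch with a hypothetical demo type:

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_lookup {
	refcount_t count;
	u64 nodeid;
};

static struct demo_lookup *demo_lookup_create(u64 nodeid)
{
	struct demo_lookup *sl = kzalloc(sizeof(*sl), GFP_KERNEL);

	if (!sl)
		return NULL;
	sl->nodeid = nodeid;
	refcount_set(&sl->count, 1);	/* creator's reference */
	return sl;
}

static void demo_lookup_share(struct demo_lookup *sl)
{
	refcount_inc(&sl->count);	/* each submount takes its own reference */
}

static void demo_lookup_put(struct demo_lookup *sl)
{
	if (!refcount_dec_and_test(&sl->count))
		return;
	/* Last user: this is the point where fuse queues the final FORGET. */
	kfree(sl);
}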
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e4eb7cf26fb..2a6d44f91729 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -68,6 +68,24 @@ struct fuse_forget_link *fuse_alloc_forget(void)
return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT);
}
+static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void)
+{
+ struct fuse_submount_lookup *sl;
+
+ sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT);
+ if (!sl)
+ return NULL;
+ sl->forget = fuse_alloc_forget();
+ if (!sl->forget)
+ goto out_free;
+
+ return sl;
+
+out_free:
+ kfree(sl);
+ return NULL;
+}
+
static struct inode *fuse_alloc_inode(struct super_block *sb)
{
struct fuse_inode *fi;
@@ -83,6 +101,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->attr_version = 0;
fi->orig_ino = 0;
fi->state = 0;
+ fi->submount_lookup = NULL;
mutex_init(&fi->mutex);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
@@ -113,6 +132,17 @@ static void fuse_free_inode(struct inode *inode)
kmem_cache_free(fuse_inode_cachep, fi);
}
+static void fuse_cleanup_submount_lookup(struct fuse_conn *fc,
+ struct fuse_submount_lookup *sl)
+{
+ if (!refcount_dec_and_test(&sl->count))
+ return;
+
+ fuse_queue_forget(fc, sl->forget, sl->nodeid, 1);
+ sl->forget = NULL;
+ kfree(sl);
+}
+
static void fuse_evict_inode(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -132,6 +162,11 @@ static void fuse_evict_inode(struct inode *inode)
fi->nlookup);
fi->forget = NULL;
}
+
+ if (fi->submount_lookup) {
+ fuse_cleanup_submount_lookup(fc, fi->submount_lookup);
+ fi->submount_lookup = NULL;
+ }
}
if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
WARN_ON(!list_empty(&fi->write_files));
@@ -188,12 +223,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
- inode->i_atime.tv_sec = attr->atime;
- inode->i_atime.tv_nsec = attr->atimensec;
+ inode_set_atime(inode, attr->atime, attr->atimensec);
/* mtime from server may be stale due to local buffered write */
if (!(cache_mask & STATX_MTIME)) {
- inode->i_mtime.tv_sec = attr->mtime;
- inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode_set_mtime(inode, attr->mtime, attr->mtimensec);
}
if (!(cache_mask & STATX_CTIME)) {
inode_set_ctime(inode, attr->ctime, attr->ctimensec);
@@ -276,12 +309,12 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
attr->size = i_size_read(inode);
if (cache_mask & STATX_MTIME) {
- attr->mtime = inode->i_mtime.tv_sec;
- attr->mtimensec = inode->i_mtime.tv_nsec;
+ attr->mtime = inode_get_mtime_sec(inode);
+ attr->mtimensec = inode_get_mtime_nsec(inode);
}
if (cache_mask & STATX_CTIME) {
- attr->ctime = inode_get_ctime(inode).tv_sec;
- attr->ctimensec = inode_get_ctime(inode).tv_nsec;
+ attr->ctime = inode_get_ctime_sec(inode);
+ attr->ctimensec = inode_get_ctime_nsec(inode);
}
if ((attr_version != 0 && fi->attr_version > attr_version) ||
@@ -290,7 +323,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
return;
}
- old_mtime = inode->i_mtime;
+ old_mtime = inode_get_mtime(inode);
fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask);
oldsize = inode->i_size;
@@ -332,13 +365,19 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
fuse_dax_dontcache(inode, attr->flags);
}
+static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl,
+ u64 nodeid)
+{
+ sl->nodeid = nodeid;
+ refcount_set(&sl->count, 1);
+}
+
static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
struct fuse_conn *fc)
{
inode->i_mode = attr->mode & S_IFMT;
inode->i_size = attr->size;
- inode->i_mtime.tv_sec = attr->mtime;
- inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode_set_mtime(inode, attr->mtime, attr->mtimensec);
inode_set_ctime(inode, attr->ctime, attr->ctimensec);
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
@@ -395,12 +434,22 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
*/
if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
S_ISDIR(attr->mode)) {
+ struct fuse_inode *fi;
+
inode = new_inode(sb);
if (!inode)
return NULL;
fuse_init_inode(inode, attr, fc);
- get_fuse_inode(inode)->nodeid = nodeid;
+ fi = get_fuse_inode(inode);
+ fi->nodeid = nodeid;
+ fi->submount_lookup = fuse_alloc_submount_lookup();
+ if (!fi->submount_lookup) {
+ iput(inode);
+ return NULL;
+ }
+ /* Sets count = 1 on fi->submount_lookup (the shared lookup reference) */
+ fuse_init_submount_lookup(fi->submount_lookup, nodeid);
inode->i_flags |= S_AUTOMOUNT;
goto done;
}
@@ -423,11 +472,11 @@ retry:
iput(inode);
goto retry;
}
-done:
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->nlookup++;
spin_unlock(&fi->lock);
+done:
fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version);
return inode;
@@ -1002,7 +1051,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
}
*max_len = len;
- return parent ? 0x82 : 0x81;
+ return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
}
static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
@@ -1010,7 +1059,8 @@ static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
{
struct fuse_inode_handle handle;
- if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+ if ((fh_type != FILEID_INO64_GEN &&
+ fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
return NULL;
handle.nodeid = (u64) fid->raw[0] << 32;
@@ -1024,7 +1074,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb,
{
struct fuse_inode_handle parent;
- if (fh_type != 0x82 || fh_len < 6)
+ if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
return NULL;
parent.nodeid = (u64) fid->raw[3] << 32;
@@ -1232,8 +1282,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->init_security = 1;
if (flags & FUSE_CREATE_SUPP_GROUP)
fc->create_supp_group = 1;
- if (flags & FUSE_DIRECT_IO_RELAX)
- fc->direct_io_relax = 1;
+ if (flags & FUSE_DIRECT_IO_ALLOW_MMAP)
+ fc->direct_io_allow_mmap = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1280,7 +1330,7 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
- FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_RELAX;
+ FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
@@ -1423,17 +1473,19 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
const struct fuse_inode *fi)
{
+ struct timespec64 atime = inode_get_atime(&fi->inode);
+ struct timespec64 mtime = inode_get_mtime(&fi->inode);
struct timespec64 ctime = inode_get_ctime(&fi->inode);
*attr = (struct fuse_attr){
.ino = fi->inode.i_ino,
.size = fi->inode.i_size,
.blocks = fi->inode.i_blocks,
- .atime = fi->inode.i_atime.tv_sec,
- .mtime = fi->inode.i_mtime.tv_sec,
+ .atime = atime.tv_sec,
+ .mtime = mtime.tv_sec,
.ctime = ctime.tv_sec,
- .atimensec = fi->inode.i_atime.tv_nsec,
- .mtimensec = fi->inode.i_mtime.tv_nsec,
+ .atimensec = atime.tv_nsec,
+ .mtimensec = mtime.tv_nsec,
.ctimensec = ctime.tv_nsec,
.mode = fi->inode.i_mode,
.nlink = fi->inode.i_nlink,
@@ -1465,6 +1517,8 @@ static int fuse_fill_super_submount(struct super_block *sb,
struct super_block *parent_sb = parent_fi->inode.i_sb;
struct fuse_attr root_attr;
struct inode *root;
+ struct fuse_submount_lookup *sl;
+ struct fuse_inode *fi;
fuse_sb_defaults(sb);
fm->sb = sb;
@@ -1487,12 +1541,27 @@ static int fuse_fill_super_submount(struct super_block *sb,
* its nlookup should not be incremented. fuse_iget() does
* that, though, so undo it here.
*/
- get_fuse_inode(root)->nlookup--;
+ fi = get_fuse_inode(root);
+ fi->nlookup--;
+
sb->s_d_op = &fuse_dentry_operations;
sb->s_root = d_make_root(root);
if (!sb->s_root)
return -ENOMEM;
+ /*
+ * Grab the parent's submount_lookup pointer and take a
+ * reference on the shared nlookup from the parent. This is to
+ * prevent the last forget for this nodeid from getting
+ * triggered until all users have finished with it.
+ */
+ sl = parent_fi->submount_lookup;
+ WARN_ON(!sl);
+ if (sl) {
+ refcount_inc(&sl->count);
+ fi->submount_lookup = sl;
+ }
+
return 0;
}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 9e6d587b3e67..c66a54d6c7d3 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -476,7 +476,7 @@ retry_locked:
if (!fi->rdc.cached) {
/* Starting cache? Set cache mtime. */
if (!ctx->pos && !fi->rdc.size) {
- fi->rdc.mtime = inode->i_mtime;
+ fi->rdc.mtime = inode_get_mtime(inode);
fi->rdc.iversion = inode_query_iversion(inode);
}
spin_unlock(&fi->rdc.lock);
@@ -488,8 +488,10 @@ retry_locked:
* changed, and reset the cache if so.
*/
if (!ctx->pos) {
+ struct timespec64 mtime = inode_get_mtime(inode);
+
if (inode_peek_iversion(inode) != fi->rdc.iversion ||
- !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) {
+ !timespec64_equal(&fi->rdc.mtime, &mtime)) {
fuse_rdc_reset(inode);
goto retry_locked;
}
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 49c01559580f..5b423fdbb13f 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -209,7 +209,7 @@ static const struct xattr_handler fuse_xattr_handler = {
.set = fuse_xattr_set,
};
-const struct xattr_handler *fuse_xattr_handlers[] = {
+const struct xattr_handler * const fuse_xattr_handlers[] = {
&fuse_xattr_handler,
NULL
};
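Both the f2fs and fuse hunks constify their xattr handler tables ('const struct xattr_handler * const'), letting the arrays live in read-only data. A minimal sketch of a table in that shape; the handler itself is a stub with its .get/.set callbacks omitted and a made-up prefix:

#include <linux/xattr.h>

static const struct xattr_handler demo_xattr_handler = {
	.prefix	= "demo.",		/* namespace handled by this entry */
	/* .get / .set would go here in a real handler */
};

/* Neither the array slots nor the handlers they point to are writable,
 * matching the new declarations above. */
static const struct xattr_handler * const demo_xattr_handlers[] = {
	&demo_xattr_handler,
	NULL,
};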
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index d4deb2b19959..82f5b09c04e6 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -11,9 +11,9 @@
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
-extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
-extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
- struct posix_acl *acl, int type);
+struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
+int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type);
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index c26d48355cc2..9611bfceda4b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -130,7 +130,7 @@ static int __gfs2_jdata_write_folio(struct folio *folio,
if (folio_test_checked(folio)) {
folio_clear_checked(folio);
if (!folio_buffers(folio)) {
- folio_create_empty_buffers(folio,
+ create_empty_buffers(folio,
inode->i_sb->s_blocksize,
BIT(BH_Dirty)|BIT(BH_Uptodate));
}
@@ -155,7 +155,7 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
+ if (gfs2_assert_withdraw(sdp, ip->i_gl->gl_state == LM_ST_EXCLUSIVE))
goto out;
if (folio_test_checked(folio) || current->journal_info)
goto out_ignore;
@@ -214,12 +214,12 @@ static int gfs2_write_jdata_batch(struct address_space *mapping,
unsigned nrblocks;
int i;
int ret;
- int nr_pages = 0;
+ size_t size = 0;
int nr_folios = folio_batch_count(fbatch);
for (i = 0; i < nr_folios; i++)
- nr_pages += folio_nr_pages(fbatch->folios[i]);
- nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits);
+ size += folio_size(fbatch->folios[i]);
+ nrblocks = size >> inode->i_blkbits;
ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
if (ret < 0)
@@ -403,27 +403,27 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
}
/**
- * stuffed_readpage - Fill in a Linux page with stuffed file data
+ * stuffed_readpage - Fill in a Linux folio with stuffed file data
* @ip: the inode
- * @page: the page
+ * @folio: the folio
*
* Returns: errno
*/
-static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio)
{
struct buffer_head *dibh;
- u64 dsize = i_size_read(&ip->i_inode);
- void *kaddr;
+ size_t i_size = i_size_read(&ip->i_inode);
+ void *data;
int error;
/*
* Due to the order of unstuffing files and ->fault(), we can be
- * asked for a zero page in the case of a stuffed file being extended,
+ * asked for a zero folio in the case of a stuffed file being extended,
* so we need to supply one here. It doesn't happen often.
*/
- if (unlikely(page->index)) {
- zero_user(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ if (unlikely(folio->index)) {
+ folio_zero_range(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
return 0;
}
@@ -431,13 +431,11 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
if (error)
return error;
- kaddr = kmap_local_page(page);
- memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
- kunmap_local(kaddr);
- flush_dcache_page(page);
+ data = dibh->b_data + sizeof(struct gfs2_dinode);
+ memcpy_to_folio(folio, 0, data, i_size);
+ folio_zero_range(folio, i_size, folio_size(folio) - i_size);
brelse(dibh);
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
return 0;
}
@@ -458,7 +456,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
error = iomap_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
- error = stuffed_readpage(ip, &folio->page);
+ error = stuffed_readpage(ip, folio);
folio_unlock(folio);
} else {
error = mpage_read_folio(folio, gfs2_block_map);
@@ -479,31 +477,29 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
*
*/
-int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
- unsigned size)
+ssize_t gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
+ size_t size)
{
struct address_space *mapping = ip->i_inode.i_mapping;
unsigned long index = *pos >> PAGE_SHIFT;
- unsigned offset = *pos & (PAGE_SIZE - 1);
- unsigned copied = 0;
- unsigned amt;
- struct page *page;
+ size_t copied = 0;
do {
- page = read_cache_page(mapping, index, gfs2_read_folio, NULL);
- if (IS_ERR(page)) {
- if (PTR_ERR(page) == -EINTR)
+ size_t offset, chunk;
+ struct folio *folio;
+
+ folio = read_cache_folio(mapping, index, gfs2_read_folio, NULL);
+ if (IS_ERR(folio)) {
+ if (PTR_ERR(folio) == -EINTR)
continue;
- return PTR_ERR(page);
+ return PTR_ERR(folio);
}
- amt = size - copied;
- if (offset + size > PAGE_SIZE)
- amt = PAGE_SIZE - offset;
- memcpy_from_page(buf + copied, page, offset, amt);
- put_page(page);
- copied += amt;
- index++;
- offset = 0;
+ offset = *pos + copied - folio_pos(folio);
+ chunk = min(size - copied, folio_size(folio) - offset);
+ memcpy_from_folio(buf + copied, folio, offset, chunk);
+ index = folio_next_index(folio);
+ folio_put(folio);
+ copied += chunk;
} while(copied < size);
(*pos) += size;
return size;
diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h
index f08322ef41cf..a10c4334d248 100644
--- a/fs/gfs2/aops.h
+++ b/fs/gfs2/aops.h
@@ -8,8 +8,8 @@
#include "incore.h"
-extern void adjust_fs_space(struct inode *inode);
-extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
- size_t from, size_t len);
+void adjust_fs_space(struct inode *inode);
+void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
+ size_t from, size_t len);
#endif /* __AOPS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index ef7017fb6951..d9ccfd27e4f1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -43,53 +43,51 @@ struct metapath {
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
/**
- * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
+ * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio
* @ip: the inode
* @dibh: the dinode buffer
* @block: the block number that was allocated
- * @page: The (optional) page. This is looked up if @page is NULL
+ * @folio: The folio.
*
* Returns: errno
*/
-
-static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
- u64 block, struct page *page)
+static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh,
+ u64 block, struct folio *folio)
{
struct inode *inode = &ip->i_inode;
- if (!PageUptodate(page)) {
- void *kaddr = kmap(page);
+ if (!folio_test_uptodate(folio)) {
+ void *kaddr = kmap_local_folio(folio, 0);
u64 dsize = i_size_read(inode);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
- kunmap(page);
+ memset(kaddr + dsize, 0, folio_size(folio) - dsize);
+ kunmap_local(kaddr);
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
if (gfs2_is_jdata(ip)) {
- struct buffer_head *bh;
+ struct buffer_head *bh = folio_buffers(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, BIT(inode->i_blkbits),
- BIT(BH_Uptodate));
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ BIT(inode->i_blkbits), BIT(BH_Uptodate));
- bh = page_buffers(page);
if (!buffer_mapped(bh))
map_bh(bh, inode->i_sb, block);
set_buffer_uptodate(bh);
gfs2_trans_add_data(ip->i_gl, bh);
} else {
- set_page_dirty(page);
+ folio_mark_dirty(folio);
gfs2_ordered_add_inode(ip);
}
return 0;
}
-static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
+static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio)
{
struct buffer_head *bh, *dibh;
struct gfs2_dinode *di;
@@ -106,7 +104,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
and write it out to disk */
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
goto out_brelse;
if (isdir) {
@@ -118,7 +116,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
dibh, sizeof(struct gfs2_dinode));
brelse(bh);
} else {
- error = gfs2_unstuffer_page(ip, dibh, block, page);
+ error = gfs2_unstuffer_folio(ip, dibh, block, folio);
if (error)
goto out_brelse;
}
@@ -157,17 +155,17 @@ out_brelse:
int gfs2_unstuff_dinode(struct gfs2_inode *ip)
{
struct inode *inode = &ip->i_inode;
- struct page *page;
+ struct folio *folio;
int error;
down_write(&ip->i_rw_mutex);
- page = grab_cache_page(inode->i_mapping, 0);
- error = -ENOMEM;
- if (!page)
+ folio = filemap_grab_folio(inode->i_mapping, 0);
+ error = PTR_ERR(folio);
+ if (IS_ERR(folio))
goto out;
- error = __gfs2_unstuff_inode(ip, page);
- unlock_page(page);
- put_page(page);
+ error = __gfs2_unstuff_inode(ip, folio);
+ folio_unlock(folio);
+ folio_put(folio);
out:
up_write(&ip->i_rw_mutex);
return error;
@@ -317,6 +315,12 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
}
}
+static inline struct buffer_head *
+metapath_dibh(struct metapath *mp)
+{
+ return mp->mp_bh[0];
+}
+
static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
unsigned int x, unsigned int h)
{
@@ -415,13 +419,12 @@ static void release_metapath(struct metapath *mp)
* gfs2_extent_length - Returns length of an extent of blocks
* @bh: The metadata block
* @ptr: Current position in @bh
- * @limit: Max extent length to return
* @eob: Set to 1 if we hit "end of block"
*
* Returns: The length of the extent (minimum of one block)
*/
-static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
+static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob)
{
const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
const __be64 *first = ptr;
@@ -660,7 +663,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct buffer_head *dibh = mp->mp_bh[0];
+ struct buffer_head *dibh = metapath_dibh(mp);
u64 bn;
unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
size_t dblks = iomap->length >> inode->i_blkbits;
@@ -702,7 +705,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
i = mp->mp_aheight;
do {
n = blks - alloced;
- ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+ ret = gfs2_alloc_blocks(ip, &bn, &n, 0);
if (ret)
goto out;
alloced += n;
@@ -913,7 +916,7 @@ unstuff:
goto do_alloc;
bh = mp->mp_bh[ip->i_height - 1];
- len = gfs2_extent_length(bh, ptr, len, &eob);
+ len = gfs2_extent_length(bh, ptr, &eob);
iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
iomap->length = len << inode->i_blkbits;
@@ -1386,7 +1389,7 @@ static int trunc_start(struct inode *inode, u64 newsize)
ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
i_size_write(inode, newsize);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_dinode_out(ip, dibh->b_data);
if (journaled)
@@ -1583,7 +1586,7 @@ out_unlock:
/* Every transaction boundary, we rewrite the dinode
to keep its di_blocks current in case of failure. */
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1949,7 +1952,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
gfs2_statfs_change(sdp, 0, +btotal, 0);
gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
ip->i_inode.i_gid);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
up_write(&ip->i_rw_mutex);
@@ -1992,7 +1995,7 @@ static int trunc_end(struct gfs2_inode *ip)
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
gfs2_ordered_del_inode(ip);
}
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -2093,7 +2096,7 @@ static int do_grow(struct inode *inode, u64 size)
goto do_end_trans;
truncate_setsize(inode, size);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
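
Illustrative sketch (not part of the patch): the buffer-attach idiom this file now uses. create_empty_buffers() returns the first buffer head, so the caller asks the folio for its buffers and only creates them when none are attached yet, as in gfs2_unstuffer_folio() above; the helper name below is hypothetical.

#include <linux/buffer_head.h>
#include <linux/pagemap.h>

/* Return the folio's first buffer head, creating the buffers if needed. */
static struct buffer_head *get_or_create_buffers(struct folio *folio,
						 unsigned long blocksize)
{
	struct buffer_head *bh = folio_buffers(folio);

	if (!bh)
		bh = create_empty_buffers(folio, blocksize, 0);
	return bh;
}
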
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index e5b7d17131ed..4e8b1e8ebdf3 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -46,24 +46,24 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
extern const struct iomap_ops gfs2_iomap_ops;
extern const struct iomap_writeback_ops gfs2_writeback_ops;
-extern int gfs2_unstuff_dinode(struct gfs2_inode *ip);
-extern int gfs2_block_map(struct inode *inode, sector_t lblock,
- struct buffer_head *bh, int create);
-extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
- struct iomap *iomap);
-extern int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
- struct iomap *iomap);
-extern int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
- unsigned int *extlen);
-extern int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
- unsigned *extlen, bool *new);
-extern int gfs2_setattr_size(struct inode *inode, u64 size);
-extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
-extern int gfs2_file_dealloc(struct gfs2_inode *ip);
-extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
- unsigned int len);
-extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
-extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
-extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length);
+int gfs2_unstuff_dinode(struct gfs2_inode *ip);
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh, int create);
+int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap);
+int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap);
+int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
+ unsigned int *extlen);
+int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
+ unsigned *extlen, bool *new);
+int gfs2_setattr_size(struct inode *inode, u64 size);
+int gfs2_truncatei_resume(struct gfs2_inode *ip);
+int gfs2_file_dealloc(struct gfs2_inode *ip);
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+ unsigned int len);
+int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
+void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
+int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length);
#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a2afa88f8be..560e4624c09f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
if (ip->i_inode.i_size < offset + size)
i_size_write(&ip->i_inode, offset + size);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -227,7 +227,7 @@ out:
if (ip->i_inode.i_size < offset + copied)
i_size_write(&ip->i_inode, offset + copied);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
@@ -868,7 +868,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
struct gfs2_dirent *dent;
struct timespec64 tv = current_time(inode);
- error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &bn, &n, 0);
if (error)
return NULL;
bh = gfs2_meta_new(ip->i_gl, bn);
@@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
da->bh = NULL;
brelse(bh);
ip->i_entries++;
- ip->i_inode.i_mtime = tv;
+ inode_set_mtime_to_ts(&ip->i_inode, tv);
if (S_ISDIR(nip->i_inode.i_mode))
inc_nlink(&ip->i_inode);
mark_inode_dirty(inode);
@@ -1911,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
if (!dip->i_entries)
gfs2_consist_inode(dip);
dip->i_entries--;
- dip->i_inode.i_mtime = tv;
+ inode_set_mtime_to_ts(&dip->i_inode, tv);
if (d_is_dir(dentry))
drop_nlink(&dip->i_inode);
mark_inode_dirty(&dip->i_inode);
@@ -1952,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
dent->de_type = cpu_to_be16(new_type);
brelse(bh);
- dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode);
+ inode_set_mtime_to_ts(&dip->i_inode, inode_set_ctime_current(&dip->i_inode));
mark_inode_dirty_sync(&dip->i_inode);
return 0;
}
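
Illustrative sketch (not part of the patch): the timestamp idiom repeated throughout this series. inode_set_ctime_current() stamps ctime and returns the new value, which is then handed to inode_set_mtime_to_ts() so mtime and ctime stay identical; touch_mtime_and_ctime() is a hypothetical helper name.

#include <linux/fs.h>

/* Stamp mtime and ctime with the same "now" value, then mark the inode dirty. */
static void touch_mtime_and_ctime(struct inode *inode)
{
	struct timespec64 now = inode_set_ctime_current(inode);

	inode_set_mtime_to_ts(inode, now);
	mark_inode_dirty_sync(inode);
}
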
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 5b76480c17c9..25a857c78b53 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,32 +23,32 @@ struct gfs2_diradd {
int save_loc;
};
-extern struct inode *gfs2_dir_search(struct inode *dir,
- const struct qstr *filename,
- bool fail_on_exist);
-extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
- const struct gfs2_inode *ip);
-extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
- const struct gfs2_inode *ip, struct gfs2_diradd *da);
+struct inode *gfs2_dir_search(struct inode *dir,
+ const struct qstr *filename,
+ bool fail_on_exist);
+int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+ const struct gfs2_inode *ip);
+int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+ const struct gfs2_inode *ip, struct gfs2_diradd *da);
static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
{
brelse(da->bh);
da->bh = NULL;
}
-extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
-extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
- struct file_ra_state *f_ra);
-extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
- const struct gfs2_inode *nip, unsigned int new_type);
+int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
+int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra);
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+ const struct gfs2_inode *nip, unsigned int new_type);
-extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
-extern int gfs2_diradd_alloc_required(struct inode *dir,
- const struct qstr *filename,
- struct gfs2_diradd *da);
-extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
- struct buffer_head **bhp);
-extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
+int gfs2_diradd_alloc_required(struct inode *dir,
+ const struct qstr *filename,
+ struct gfs2_diradd *da);
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+ struct buffer_head **bhp);
+void gfs2_dir_hash_inval(struct gfs2_inode *ip);
static inline u32 gfs2_disk_hash(const char *data, int len)
{
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f2700477a300..4b66efc1a82a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -418,7 +418,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
u64 offset = page_offset(page);
unsigned int data_blocks, ind_blocks, rblocks;
vm_fault_t ret = VM_FAULT_LOCKED;
@@ -1120,14 +1120,16 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret)
goto out_unlock;
- ret = file_update_time(file);
- if (ret)
- goto out_unlock;
-
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
ssize_t buffered, ret2;
+ /*
+ * Note that under direct I/O, we don't allow any inode
+ * timestamp updates, so we're not calling file_update_time()
+ * here.
+ */
+
ret = gfs2_file_direct_write(iocb, from, &gh);
if (ret < 0 || !iov_iter_count(from))
goto out_unlock;
@@ -1154,6 +1156,10 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!ret || ret2 > 0)
ret += ret2;
} else {
+ ret = file_update_time(file);
+ if (ret)
+ goto out_unlock;
+
ret = gfs2_file_buffered_write(iocb, from, &gh);
if (likely(ret > 0))
ret = generic_write_sync(iocb, ret);
@@ -1245,7 +1251,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
loff_t bytes, max_bytes, max_blks;
int error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4a280be229a6..d6bf1f8c25dc 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1524,7 +1524,6 @@ fail:
return;
}
list_add_tail(&gh->gh_list, insert_pt);
- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
@@ -2041,11 +2040,7 @@ static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
return vfs_pressure_ratio(atomic_read(&lru_count));
}
-static struct shrinker glock_shrinker = {
- .seeks = DEFAULT_SEEKS,
- .count_objects = gfs2_glock_shrink_count,
- .scan_objects = gfs2_glock_shrink_scan,
-};
+static struct shrinker *glock_shrinker;
/**
* glock_hash_walk - Call a function for glock in a hash bucket
@@ -2465,13 +2460,18 @@ int __init gfs2_glock_init(void)
return -ENOMEM;
}
- ret = register_shrinker(&glock_shrinker, "gfs2-glock");
- if (ret) {
+ glock_shrinker = shrinker_alloc(0, "gfs2-glock");
+ if (!glock_shrinker) {
destroy_workqueue(glock_workqueue);
rhashtable_destroy(&gl_hash_table);
- return ret;
+ return -ENOMEM;
}
+ glock_shrinker->count_objects = gfs2_glock_shrink_count;
+ glock_shrinker->scan_objects = gfs2_glock_shrink_scan;
+
+ shrinker_register(glock_shrinker);
+
for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(glock_wait_table + i);
@@ -2480,7 +2480,7 @@ int __init gfs2_glock_init(void)
void gfs2_glock_exit(void)
{
- unregister_shrinker(&glock_shrinker);
+ shrinker_free(glock_shrinker);
rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
}
@@ -2719,16 +2719,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
for(;; i->fd++) {
struct inode *inode;
- i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+ i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
+
inode = file_inode(i->file);
- if (inode->i_sb != i->sb)
- continue;
- if (get_file_rcu(i->file))
+ if (inode->i_sb == i->sb)
break;
+
+ rcu_read_unlock();
+ fput(i->file);
+ rcu_read_lock();
}
rcu_read_unlock();
return i->file;
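
Illustrative sketch (not part of the patch): the dynamically allocated shrinker pattern the glock code switches to above. The example_* names and the no-op callbacks are placeholders; the flow is allocate, fill in the callbacks, register, and free on teardown.

#include <linux/shrinker.h>

static struct shrinker *example_shrinker;

static unsigned long example_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	return 0;	/* nothing to reclaim in this sketch */
}

static unsigned long example_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static int __init example_shrinker_init(void)
{
	example_shrinker = shrinker_alloc(0, "example");
	if (!example_shrinker)
		return -ENOMEM;

	example_shrinker->count_objects = example_count;
	example_shrinker->scan_objects = example_scan;
	shrinker_register(example_shrinker);
	return 0;
}

static void example_shrinker_exit(void)
{
	shrinker_free(example_shrinker);
}
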
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c8685ca7d2a2..61197598abfd 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -156,21 +156,6 @@ out:
return gh;
}
-static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_EXCLUSIVE;
-}
-
-static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_DEFERRED;
-}
-
-static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_SHARED;
-}
-
static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
{
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
@@ -181,40 +166,40 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
return NULL;
}
-extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
- const struct gfs2_glock_operations *glops,
- int create, struct gfs2_glock **glp);
-extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
-extern void gfs2_glock_put(struct gfs2_glock *gl);
-extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
+int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ int create, struct gfs2_glock **glp);
+struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
+void gfs2_glock_put(struct gfs2_glock *gl);
+void gfs2_glock_queue_put(struct gfs2_glock *gl);
-extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- u16 flags, struct gfs2_holder *gh,
- unsigned long ip);
+void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh,
+ unsigned long ip);
static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
u16 flags, struct gfs2_holder *gh) {
__gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
}
-extern void gfs2_holder_reinit(unsigned int state, u16 flags,
- struct gfs2_holder *gh);
-extern void gfs2_holder_uninit(struct gfs2_holder *gh);
-extern int gfs2_glock_nq(struct gfs2_holder *gh);
-extern int gfs2_glock_poll(struct gfs2_holder *gh);
-extern int gfs2_instantiate(struct gfs2_holder *gh);
-extern int gfs2_glock_holder_ready(struct gfs2_holder *gh);
-extern int gfs2_glock_wait(struct gfs2_holder *gh);
-extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_glock_dq(struct gfs2_holder *gh);
-extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
-extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
-extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
- const struct gfs2_glock_operations *glops,
- unsigned int state, u16 flags,
- struct gfs2_holder *gh);
-extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
+void gfs2_holder_reinit(unsigned int state, u16 flags,
+ struct gfs2_holder *gh);
+void gfs2_holder_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq(struct gfs2_holder *gh);
+int gfs2_glock_poll(struct gfs2_holder *gh);
+int gfs2_instantiate(struct gfs2_holder *gh);
+int gfs2_glock_holder_ready(struct gfs2_holder *gh);
+int gfs2_glock_wait(struct gfs2_holder *gh);
+int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq(struct gfs2_holder *gh);
+void gfs2_glock_dq_wait(struct gfs2_holder *gh);
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ unsigned int state, u16 flags,
+ struct gfs2_holder *gh);
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
bool fsid);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \
gfs2_dump_glock(NULL, gl, true); \
@@ -228,7 +213,7 @@ extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \
while (0)
-extern __printf(2, 3)
+__printf(2, 3)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
@@ -256,27 +241,27 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
return error;
}
-extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
-extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
-extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl);
-extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
-extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
-extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
-extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
-extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
-extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
-extern void gfs2_glock_free(struct gfs2_glock *gl);
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
+bool gfs2_queue_try_to_evict(struct gfs2_glock *gl);
+void gfs2_cancel_delete_work(struct gfs2_glock *gl);
+void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
+void gfs2_glock_thaw(struct gfs2_sbd *sdp);
+void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
+void gfs2_glock_free(struct gfs2_glock *gl);
-extern int __init gfs2_glock_init(void);
-extern void gfs2_glock_exit(void);
+int __init gfs2_glock_init(void);
+void gfs2_glock_exit(void);
-extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
-extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-extern void gfs2_register_debugfs(void);
-extern void gfs2_unregister_debugfs(void);
+void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_register_debugfs(void);
+void gfs2_unregister_debugfs(void);
-extern void glock_set_object(struct gfs2_glock *gl, void *object);
-extern void glock_clear_object(struct gfs2_glock *gl, void *object);
+void glock_set_object(struct gfs2_glock *gl, void *object);
+void glock_clear_object(struct gfs2_glock *gl, void *object);
extern const struct lm_lockops gfs2_dlm_ops;
@@ -295,7 +280,7 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh)
return !list_empty(&gh->gh_list);
}
-extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
-extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
+void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
+bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f41ca89d216b..b41c78bd2cc0 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -403,7 +403,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
const struct gfs2_dinode *str = buf;
- struct timespec64 atime;
+ struct timespec64 atime, iatime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
struct inode *inode = &ip->i_inode;
@@ -433,10 +433,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
- if (timespec64_compare(&inode->i_atime, &atime) < 0)
- inode->i_atime = atime;
- inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
- inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+ iatime = inode_get_atime(inode);
+ if (timespec64_compare(&iatime, &atime) < 0)
+ inode_set_atime_to_ts(inode, atime);
+ inode_set_mtime(inode, be64_to_cpu(str->di_mtime),
+ be32_to_cpu(str->di_mtime_nsec));
inode_set_ctime(inode, be64_to_cpu(str->di_ctime),
be32_to_cpu(str->di_ctime_nsec));
@@ -614,18 +615,6 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl)
}
/**
- * freeze_go_demote_ok
- * @gl: the glock
- *
- * Always returns 0
- */
-
-static int freeze_go_demote_ok(const struct gfs2_glock *gl)
-{
- return 0;
-}
-
-/**
* iopen_go_callback - schedule the dcache entry for the inode to be deleted
* @gl: the glock
* @remote: true if this came from a different cluster node
@@ -744,7 +733,6 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
const struct gfs2_glock_operations gfs2_freeze_glops = {
.go_xmote_bh = freeze_go_xmote_bh,
- .go_demote_ok = freeze_go_demote_ok,
.go_callback = freeze_go_callback,
.go_type = LM_TYPE_NONDISK,
.go_flags = GLOF_NONDISK,
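
Illustrative sketch (not part of the patch): the atime merge done by gfs2_dinode_in() above, expressed with the new accessors. The in-memory atime is only moved forward, never back to an older on-disk value; merge_ondisk_atime() is a hypothetical name.

#include <linux/fs.h>

/* Adopt the on-disk atime only if it is newer than the cached one. */
static void merge_ondisk_atime(struct inode *inode, struct timespec64 disk_atime)
{
	struct timespec64 mem_atime = inode_get_atime(inode);

	if (timespec64_compare(&mem_atime, &disk_atime) < 0)
		inode_set_atime_to_ts(inode, disk_atime);
}
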
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 695898afcaf1..9341423798df 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -22,7 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
extern const struct gfs2_glock_operations gfs2_journal_glops;
extern const struct gfs2_glock_operations *gfs2_glops_list[];
-extern int gfs2_inode_metasync(struct gfs2_glock *gl);
-extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
+int gfs2_inode_metasync(struct gfs2_glock *gl);
+void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a8c95c5293c6..95a334d64da2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -863,7 +863,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
preempt_enable();
}
-extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl);
+struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl);
static inline unsigned gfs2_max_stuffed_size(const struct gfs2_inode *ip)
{
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 0eac04507904..1b95db2c3aac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -185,8 +185,9 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
/* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */
- inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1);
- inode->i_atime.tv_nsec = 0;
+ inode_set_atime(inode,
+ 1LL << (8 * sizeof(inode_get_atime_sec(inode)) - 1),
+ 0);
glock_set_object(ip->i_gl, ip);
@@ -265,17 +266,18 @@ fail_iput:
}
-struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
+/**
+ * gfs2_lookup_meta - Look up an inode in a metadata directory
+ * @dip: The directory
+ * @name: The name of the inode
+ */
+struct inode *gfs2_lookup_meta(struct inode *dip, const char *name)
{
struct qstr qstr;
struct inode *inode;
+
gfs2_str2qstr(&qstr, name);
inode = gfs2_lookupi(dip, &qstr, 1);
- /* gfs2_lookupi has inconsistent callers: vfs
- * related routines expect NULL for no entry found,
- * gfs2_lookup_simple callers expect ENOENT
- * and do not check for NULL.
- */
if (IS_ERR_OR_NULL(inode))
return inode ? inode : ERR_PTR(-ENOENT);
@@ -417,7 +419,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
if (error)
goto out_ipreserv;
- error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
+ error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1);
if (error)
goto out_trans_end;
@@ -696,7 +698,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
inode->i_rdev = dev;
inode->i_size = size;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
munge_mode_uid_gid(dip, inode);
check_and_update_goal(dip);
ip->i_goal = dip->i_goal;
@@ -1866,16 +1868,24 @@ out:
int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask)
{
+ int may_not_block = mask & MAY_NOT_BLOCK;
struct gfs2_inode *ip;
struct gfs2_holder i_gh;
+ struct gfs2_glock *gl;
int error;
gfs2_holder_mark_uninitialized(&i_gh);
ip = GFS2_I(inode);
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- if (mask & MAY_NOT_BLOCK)
+ gl = rcu_dereference_check(ip->i_gl, !may_not_block);
+ if (unlikely(!gl)) {
+ /* inode is getting torn down, must be RCU mode */
+ WARN_ON_ONCE(!may_not_block);
+ return -ECHILD;
+ }
+ if (gfs2_glock_is_locked_by_me(gl) == NULL) {
+ if (may_not_block)
return -ECHILD;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
}
@@ -1920,7 +1930,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
kuid_t ouid, nuid;
kgid_t ogid, ngid;
int error;
- struct gfs2_alloc_parms ap;
+ struct gfs2_alloc_parms ap = {};
ouid = inode->i_uid;
ogid = inode->i_gid;
@@ -2153,7 +2163,7 @@ static int gfs2_update_time(struct inode *inode, int flags)
int error;
gh = gfs2_glock_is_locked_by_me(gl);
- if (gh && !gfs2_glock_is_held_excl(gl)) {
+ if (gh && gl->gl_state != LM_ST_EXCLUSIVE) {
gfs2_glock_dq(gh);
gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, gh);
error = gfs2_glock_nq(gh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c8c5814e7295..fd15d1c6b6fb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -13,9 +13,9 @@
#include "util.h"
bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask);
-extern int gfs2_internal_read(struct gfs2_inode *ip,
- char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_aops(struct inode *inode);
+ssize_t gfs2_internal_read(struct gfs2_inode *ip,
+ char *buf, loff_t *pos, size_t size);
+void gfs2_set_aops(struct inode *inode);
static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
{
@@ -44,19 +44,17 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
{
- inode->i_blocks = blocks <<
- (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+ inode->i_blocks = blocks << (inode->i_blkbits - 9);
}
static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
{
- return inode->i_blocks >>
- (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+ return inode->i_blocks >> (inode->i_blkbits - 9);
}
static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
{
- change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT;
+ change <<= inode->i_blkbits - 9;
gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
inode->i_blocks += change;
}
@@ -88,33 +86,33 @@ err:
return -EIO;
}
-extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
- u64 no_addr, u64 no_formal_ino,
- unsigned int blktype);
-extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
- u64 no_formal_ino,
- unsigned int blktype);
-
-extern int gfs2_inode_refresh(struct gfs2_inode *ip);
-
-extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
- int is_root);
-extern int gfs2_permission(struct mnt_idmap *idmap,
- struct inode *inode, int mask);
-extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
-extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-extern int gfs2_open_common(struct inode *inode, struct file *file);
-extern loff_t gfs2_seek_data(struct file *file, loff_t offset);
-extern loff_t gfs2_seek_hole(struct file *file, loff_t offset);
+struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
+ u64 no_addr, u64 no_formal_ino,
+ unsigned int blktype);
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+ u64 no_formal_ino,
+ unsigned int blktype);
+
+int gfs2_inode_refresh(struct gfs2_inode *ip);
+
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+ int is_root);
+int gfs2_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask);
+struct inode *gfs2_lookup_meta(struct inode *dip, const char *name);
+void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+int gfs2_open_common(struct inode *inode, struct file *file);
+loff_t gfs2_seek_data(struct file *file, loff_t offset);
+loff_t gfs2_seek_hole(struct file *file, loff_t offset);
extern const struct file_operations gfs2_file_fops_nolock;
extern const struct file_operations gfs2_dir_fops_nolock;
-extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-extern int gfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
-extern void gfs2_set_inode_flags(struct inode *inode);
-
+int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int gfs2_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct fileattr *fa);
+void gfs2_set_inode_flags(struct inode *inode);
+
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
extern const struct file_operations gfs2_file_fops;
extern const struct file_operations gfs2_dir_fops;
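
Illustrative sketch (not part of the patch): the unit conversion behind the (i_blkbits - 9) shifts above. inode->i_blocks counts 512-byte sectors, so a filesystem block of 1 << i_blkbits bytes spans 1 << (i_blkbits - 9) sectors; with 4096-byte blocks (i_blkbits == 12), 10 blocks become 10 << 3 == 80 sectors. Standalone arithmetic only.

#include <stdint.h>

static uint64_t fs_blocks_to_sectors(uint64_t blocks, unsigned int blkbits)
{
	return blocks << (blkbits - 9);	/* 512-byte sectors per fs block */
}

static uint64_t sectors_to_fs_blocks(uint64_t sectors, unsigned int blkbits)
{
	return sectors >> (blkbits - 9);
}
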
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 653cffcbf869..c27b05099c1e 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -70,29 +70,29 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
}
}
-extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
-extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct);
-extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp);
-extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes);
-extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
-extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
- unsigned int *extra_revokes);
-extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
- unsigned int *extra_revokes);
-extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- u64 seq, u32 tail, u32 lblock, u32 flags,
- blk_opf_t op_flags);
-extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
- u32 type);
-extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
-extern void log_flush_wait(struct gfs2_sbd *sdp);
+void gfs2_ordered_del_inode(struct gfs2_inode *ip);
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct);
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
+bool gfs2_log_is_empty(struct gfs2_sbd *sdp);
+void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes);
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+ unsigned int *extra_revokes);
+void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+ unsigned int *extra_revokes);
+void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ u64 seq, u32 tail, u32 lblock, u32 flags,
+ blk_opf_t op_flags);
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+ u32 type);
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
+void log_flush_wait(struct gfs2_sbd *sdp);
-extern int gfs2_logd(void *data);
-extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
-extern void gfs2_flush_revokes(struct gfs2_sbd *sdp);
-extern void gfs2_ail_drain(struct gfs2_sbd *sdp);
+int gfs2_logd(void *data);
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
+void gfs2_flush_revokes(struct gfs2_sbd *sdp);
+void gfs2_ail_drain(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 1412ffba1d44..07890c7b145d 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -11,16 +11,18 @@
#include "incore.h"
extern const struct gfs2_log_operations *gfs2_log_ops[];
-extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
-extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn);
-extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- struct page *page, unsigned size, unsigned offset,
- u64 blkno);
-extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf);
-extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
-extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head, bool keep_cache);
-extern void gfs2_drain_revokes(struct gfs2_sbd *sdp);
+
+void gfs2_log_incr_head(struct gfs2_sbd *sdp);
+u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn);
+void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ struct page *page, unsigned size, unsigned offset,
+ u64 blkno);
+void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf);
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
+int gfs2_find_jhead(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head, bool keep_cache);
+void gfs2_drain_revokes(struct gfs2_sbd *sdp);
+
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
return sdp->sd_ldptrs;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 66eb98b690a2..79be0cdc730c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_trans_cachep)
goto fail_cachep8;
- error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
+ error = gfs2_qd_shrinker_init();
if (error)
goto fail_shrinker;
@@ -196,7 +196,7 @@ fail_wq3:
fail_wq2:
destroy_workqueue(gfs2_recovery_wq);
fail_wq1:
- unregister_shrinker(&gfs2_qd_shrinker);
+ gfs2_qd_shrinker_exit();
fail_shrinker:
kmem_cache_destroy(gfs2_trans_cachep);
fail_cachep8:
@@ -229,7 +229,7 @@ fail_lru:
static void __exit exit_gfs2_fs(void)
{
- unregister_shrinker(&gfs2_qd_shrinker);
+ gfs2_qd_shrinker_exit();
gfs2_glock_exit();
gfs2_unregister_debugfs();
unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 924361fa510b..25ceb0805df2 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -115,7 +115,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
unsigned int shift;
unsigned long index;
@@ -129,36 +129,31 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
bufnum = blkno - (index << shift); /* block buf index within page */
if (create) {
- for (;;) {
- page = grab_cache_page(mapping, index);
- if (page)
- break;
- yield();
- }
- if (!page_has_buffers(page))
- create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(mapping) | __GFP_NOFAIL);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ sdp->sd_sb.sb_bsize, 0);
} else {
- page = find_get_page_flags(mapping, index,
- FGP_LOCK|FGP_ACCESSED);
- if (!page)
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (IS_ERR(folio))
return NULL;
- if (!page_has_buffers(page)) {
- bh = NULL;
- goto out_unlock;
- }
+ bh = folio_buffers(folio);
}
- /* Locate header for our buffer within our page */
- for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
- /* Do nothing */;
- get_bh(bh);
+ if (!bh)
+ goto out_unlock;
+ bh = get_nth_bh(bh, bufnum);
if (!buffer_mapped(bh))
map_bh(bh, sdp->sd_vfs, blkno);
out_unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return bh;
}
@@ -405,26 +400,20 @@ static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno)
{
struct address_space *mapping = ip->i_inode.i_mapping;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
unsigned long index = blkno >> shift; /* convert block to page */
unsigned int bufnum = blkno - (index << shift);
- page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED);
- if (!page)
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED, 0);
+ if (IS_ERR(folio))
return NULL;
- if (!page_has_buffers(page)) {
- unlock_page(page);
- put_page(page);
- return NULL;
- }
- /* Locate header for our buffer within our page */
- for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
- /* Do nothing */;
- get_bh(bh);
- unlock_page(page);
- put_page(page);
+ bh = folio_buffers(folio);
+ if (bh)
+ bh = get_nth_bh(bh, bufnum);
+ folio_unlock(folio);
+ folio_put(folio);
return bh;
}
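
Illustrative sketch (not part of the patch): the read-side lookup gfs2_getjdatabuf() now performs: find the folio without creating it, take the nth buffer head if buffers are attached, and always drop the folio lock and reference. lookup_nth_bh() is a hypothetical name.

#include <linux/buffer_head.h>
#include <linux/pagemap.h>

static struct buffer_head *lookup_nth_bh(struct address_space *mapping,
					 pgoff_t index, unsigned int bufnum)
{
	struct folio *folio;
	struct buffer_head *bh;

	folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED, 0);
	if (IS_ERR(folio))
		return NULL;

	bh = folio_buffers(folio);
	if (bh)
		bh = get_nth_bh(bh, bufnum);	/* takes its own reference */
	folio_unlock(folio);
	folio_put(folio);
	return bh;
}
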
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index d0a58cdd433a..831d988c2ceb 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -50,21 +50,21 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
return inode->i_sb->s_fs_info;
}
-extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
-extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- int rahead, struct buffer_head **bhp);
-extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
-extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
- int create);
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+ int rahead, struct buffer_head **bhp);
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
+ int create);
enum {
REMOVE_JDATA = 0,
REMOVE_META = 1,
};
-extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
-extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
-extern int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
- struct buffer_head **bhp);
+void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
+void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
+ struct buffer_head **bhp);
static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
struct buffer_head **bhp)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 33ca04733e93..b108c5d26839 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -292,8 +292,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
return error;
}
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
- GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
sizeof(struct gfs2_dinode)) / sizeof(u64);
@@ -648,7 +647,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
struct gfs2_jdesc *jd;
struct gfs2_inode *ip;
- sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
+ sdp->sd_statfs_inode = gfs2_lookup_meta(master, "statfs");
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -657,7 +656,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
if (sdp->sd_args.ar_spectator)
goto out;
- pn = gfs2_lookup_simple(master, "per_node");
+ pn = gfs2_lookup_meta(master, "per_node");
if (IS_ERR(pn)) {
error = PTR_ERR(pn);
fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -674,7 +673,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
goto free_local;
}
sprintf(buf, "statfs_change%u", jd->jd_jid);
- lsi->si_sc_inode = gfs2_lookup_simple(pn, buf);
+ lsi->si_sc_inode = gfs2_lookup_meta(pn, buf);
if (IS_ERR(lsi->si_sc_inode)) {
error = PTR_ERR(lsi->si_sc_inode);
fs_err(sdp, "can't find local \"sc\" file#%u: %d\n",
@@ -739,7 +738,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (undo)
goto fail_statfs;
- sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
+ sdp->sd_jindex = gfs2_lookup_meta(master, "jindex");
if (IS_ERR(sdp->sd_jindex)) {
fs_err(sdp, "can't lookup journal index: %d\n", error);
return PTR_ERR(sdp->sd_jindex);
@@ -888,7 +887,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
goto fail;
/* Read in the resource index inode */
- sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
+ sdp->sd_rindex = gfs2_lookup_meta(master, "rindex");
if (IS_ERR(sdp->sd_rindex)) {
error = PTR_ERR(sdp->sd_rindex);
fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -897,7 +896,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
sdp->sd_rindex_uptodate = 0;
/* Read in the quota inode */
- sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
+ sdp->sd_quota_inode = gfs2_lookup_meta(master, "quota");
if (IS_ERR(sdp->sd_quota_inode)) {
error = PTR_ERR(sdp->sd_quota_inode);
fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -941,7 +940,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
if (undo)
goto fail_qc_gh;
- pn = gfs2_lookup_simple(master, "per_node");
+ pn = gfs2_lookup_meta(master, "per_node");
if (IS_ERR(pn)) {
error = PTR_ERR(pn);
fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -949,7 +948,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
}
sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
- sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
+ sdp->sd_qc_inode = gfs2_lookup_meta(pn, buf);
if (IS_ERR(sdp->sd_qc_inode)) {
error = PTR_ERR(sdp->sd_qc_inode);
fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
@@ -1126,8 +1125,7 @@ static int init_threads(struct gfs2_sbd *sdp)
return 0;
fail:
- kthread_stop(sdp->sd_logd_process);
- put_task_struct(sdp->sd_logd_process);
+ kthread_stop_put(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
return error;
}
@@ -1135,13 +1133,11 @@ fail:
void gfs2_destroy_threads(struct gfs2_sbd *sdp)
{
if (sdp->sd_logd_process) {
- kthread_stop(sdp->sd_logd_process);
- put_task_struct(sdp->sd_logd_process);
+ kthread_stop_put(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
}
if (sdp->sd_quotad_process) {
- kthread_stop(sdp->sd_quotad_process);
- put_task_struct(sdp->sd_quotad_process);
+ kthread_stop_put(sdp->sd_quotad_process);
sdp->sd_quotad_process = NULL;
}
}
@@ -1190,10 +1186,9 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
/* Set up the buffer cache and fill in some fake block size values
to allow us to read-in the on-disk superblock. */
- sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
+ sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512);
sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
- GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
@@ -1281,10 +1276,8 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
if (!sb_rdonly(sb)) {
error = init_threads(sdp);
- if (error) {
- gfs2_withdraw_delayed(sdp);
+ if (error)
goto fail_per_node;
- }
}
error = gfs2_freeze_lock_shared(sdp);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 171b2713d2e5..95dae7838b4e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -196,13 +196,26 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
}
-struct shrinker gfs2_qd_shrinker = {
- .count_objects = gfs2_qd_shrink_count,
- .scan_objects = gfs2_qd_shrink_scan,
- .seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
-};
+static struct shrinker *gfs2_qd_shrinker;
+
+int __init gfs2_qd_shrinker_init(void)
+{
+ gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd");
+ if (!gfs2_qd_shrinker)
+ return -ENOMEM;
+
+ gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count;
+ gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan;
+
+ shrinker_register(gfs2_qd_shrinker);
+ return 0;
+}
+
+void gfs2_qd_shrinker_exit(void)
+{
+ shrinker_free(gfs2_qd_shrinker);
+}
static u64 qd2index(struct gfs2_quota_data *qd)
{
@@ -457,6 +470,17 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
(sync_gen && (qd->qd_sync_gen >= *sync_gen)))
return 0;
+ /*
+ * If qd_change is 0 it means a pending quota change was negated.
+ * We should not sync it, but we still have a qd reference and slot
+ * reference taken by gfs2_quota_change -> do_qc that need to be put.
+ */
+ if (!qd->qd_change && test_and_clear_bit(QDF_CHANGE, &qd->qd_flags)) {
+ slot_put(qd);
+ qd_put(qd);
+ return 0;
+ }
+
if (!lockref_get_not_dead(&qd->qd_lockref))
return 0;
@@ -736,7 +760,7 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct inode *inode = &ip->i_inode;
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
u64 blk;
unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0;
@@ -745,15 +769,15 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift);
boff = off % bsize;
- page = grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- if (!page_has_buffers(page))
- create_empty_buffers(page, bsize, 0);
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio, bsize, 0);
- bh = page_buffers(page);
- for(;;) {
- /* Find the beginning block within the page */
+ for (;;) {
+ /* Find the beginning block within the folio */
if (pg_off >= ((bnum * bsize) + bsize)) {
bh = bh->b_this_page;
bnum++;
@@ -766,9 +790,10 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
goto unlock_out;
/* If it's a newly allocated disk block, zero it */
if (buffer_new(bh))
- zero_user(page, bnum * bsize, bh->b_size);
+ folio_zero_range(folio, bnum * bsize,
+ bh->b_size);
}
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
if (bh_read(bh, REQ_META | REQ_PRIO) < 0)
goto unlock_out;
@@ -784,17 +809,17 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
break;
}
- /* Write to the page, now that we have setup the buffer(s) */
- memcpy_to_page(page, off, buf, bytes);
- flush_dcache_page(page);
- unlock_page(page);
- put_page(page);
+ /* Write to the folio, now that we have set up the buffer(s) */
+ memcpy_to_folio(folio, off, buf, bytes);
+ flush_dcache_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
unlock_out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return -EIO;
}
@@ -886,7 +911,7 @@ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc,
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size)
i_size_write(inode, size);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
set_bit(QDF_REFRESH, &qd->qd_flags);
}
@@ -898,7 +923,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
struct gfs2_sbd *sdp = (*qda)->qd_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
unsigned int data_blocks, ind_blocks;
struct gfs2_holder *ghs, i_gh;
unsigned int qx, x;
@@ -1072,8 +1097,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
u32 x;
int error;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
error = gfs2_quota_hold(ip, uid, gid);
@@ -1180,17 +1204,16 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
#define MAX_LINE 256
-static int print_message(struct gfs2_quota_data *qd, char *type)
+static void print_message(struct gfs2_quota_data *qd, char *type)
{
struct gfs2_sbd *sdp = qd->qd_sbd;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) {
fs_info(sdp, "quota %s for %s %u\n",
type,
(qd->qd_id.type == USRQUOTA) ? "user" : "group",
from_kqid(&init_user_ns, qd->qd_id));
-
- return 0;
+ }
}
/**
@@ -1260,7 +1283,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
* HZ)) {
quota_send_warning(qd->qd_id,
sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
- error = print_message(qd, "warning");
+ print_message(qd, "warning");
+ error = 0;
qd->qd_last_warn = jiffies;
}
}
@@ -1274,8 +1298,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 x;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- if ((sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) ||
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ||
gfs2_assert_warn(sdp, change))
return;
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
@@ -1732,7 +1755,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (gfs2_is_stuffed(ip))
alloc_required = 1;
if (alloc_required) {
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
&data_blocks, &ind_blocks);
blocks = 1 + data_blocks + ind_blocks;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 1429945215a0..f462d9cb3087 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,27 +15,27 @@ struct gfs2_sbd;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
-extern int gfs2_qa_get(struct gfs2_inode *ip);
-extern void gfs2_qa_put(struct gfs2_inode *ip);
-extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
-extern void gfs2_quota_unhold(struct gfs2_inode *ip);
+int gfs2_qa_get(struct gfs2_inode *ip);
+void gfs2_qa_put(struct gfs2_inode *ip);
+int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+void gfs2_quota_unhold(struct gfs2_inode *ip);
-extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
-extern void gfs2_quota_unlock(struct gfs2_inode *ip);
+int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+void gfs2_quota_unlock(struct gfs2_inode *ip);
-extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
- struct gfs2_alloc_parms *ap);
-extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
- kuid_t uid, kgid_t gid);
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+ struct gfs2_alloc_parms *ap);
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+ kuid_t uid, kgid_t gid);
-extern int gfs2_quota_sync(struct super_block *sb, int type);
-extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
+int gfs2_quota_sync(struct super_block *sb, int type);
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
-extern int gfs2_quota_init(struct gfs2_sbd *sdp);
-extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
-extern int gfs2_quotad(void *data);
+int gfs2_quota_init(struct gfs2_sbd *sdp);
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+int gfs2_quotad(void *data);
-extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
struct gfs2_alloc_parms *ap)
@@ -50,8 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
return ret;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
return 0;
ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
if (ret)
@@ -60,8 +59,10 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
}
extern const struct quotactl_ops gfs2_quotactl_ops;
-extern struct shrinker gfs2_qd_shrinker;
+int __init gfs2_qd_shrinker_init(void);
+void gfs2_qd_shrinker_exit(void);
extern struct list_lru gfs2_qd_lru;
-extern void __init gfs2_quota_hash_init(void);
+
+void __init gfs2_quota_hash_init(void);
#endif /* __QUOTA_DOT_H__ */
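The quota.h hunk above, and the recovery.h, rgrp.h, super.h, trans.h, util.h and xattr.h hunks that follow, are the same mechanical cleanup: the redundant "extern" keyword is dropped from function prototypes, since file-scope function declarations already have external linkage by default. A minimal stand-alone illustration (hypothetical function, not from the patch):

/* extern_demo.c - "extern" on a function prototype changes nothing. */
#include <stdio.h>

int add(int a, int b);		/* implicitly extern */
extern int add(int a, int b);	/* identical declaration; the keyword is redundant */

int add(int a, int b)
{
	return a + b;
}

int main(void)
{
	printf("%d\n", add(2, 3));	/* prints 5 either way */
	return 0;
}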
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 7a0c9d0b7503..6a0fd42e1120 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -17,18 +17,18 @@ static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk)
*blk = 0;
}
-extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
struct buffer_head **bh);
-extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
-extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
-extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
+int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+void gfs2_revoke_clean(struct gfs2_jdesc *jd);
-extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
-extern void gfs2_recover_func(struct work_struct *work);
-extern int __get_log_header(struct gfs2_sbd *sdp,
- const struct gfs2_log_header *lh, unsigned int blkno,
- struct gfs2_log_header_host *head);
+int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
+void gfs2_recover_func(struct work_struct *work);
+int __get_log_header(struct gfs2_sbd *sdp,
+ const struct gfs2_log_header *lh, unsigned int blkno,
+ struct gfs2_log_header_host *head);
#endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9308190895c8..c2060203b98a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2411,13 +2411,12 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
* @bn: Used to return the starting block number
* @nblocks: requested number of blocks/extent length (value/result)
* @dinode: 1 if we're allocating a dinode block, else 0
- * @generation: the generation number of the inode
*
* Returns: 0 or error
*/
int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
- bool dinode, u64 *generation)
+ bool dinode)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct buffer_head *dibh;
@@ -2477,10 +2476,13 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
rbm.rgd->rd_free -= *nblocks;
spin_unlock(&rbm.rgd->rd_rsspin);
if (dinode) {
+ u64 generation;
+
rbm.rgd->rd_dinodes++;
- *generation = rbm.rgd->rd_igeneration++;
- if (*generation == 0)
- *generation = rbm.rgd->rd_igeneration++;
+ generation = rbm.rgd->rd_igeneration++;
+ if (generation == 0)
+ generation = rbm.rgd->rd_igeneration++;
+ ip->i_generation = generation;
}
gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
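In the rgrp.c hunk above, the generation out-parameter of gfs2_alloc_blocks() is folded into the allocator: when a dinode is allocated, the resource group's generation counter is consumed and stored directly in ip->i_generation, which is why the xattr.c callers later in this patch simply drop their trailing NULL argument. A simplified user-space sketch of the resulting pattern (hypothetical types, not GFS2 code):

#include <stdio.h>

struct rgrp  { unsigned long long igeneration; };
struct inode { unsigned long long i_generation; };

/* The allocator now stores the generation in the inode it belongs to,
 * instead of returning it through a pointer the caller may not want. */
static void alloc_blocks(struct rgrp *rgd, struct inode *ip, int dinode)
{
	if (dinode) {
		unsigned long long generation = rgd->igeneration++;

		if (generation == 0)		/* skip 0, mirroring the check in the patch */
			generation = rgd->igeneration++;
		ip->i_generation = generation;
	}
}

int main(void)
{
	struct rgrp rgd = { 0 };
	struct inode ip = { 0 };

	alloc_blocks(&rgd, &ip, 1);
	printf("generation %llu\n", ip.i_generation);	/* 1: the value 0 was skipped */
	return 0;
}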
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 00b30cf893af..8d20e99385db 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -22,38 +22,38 @@ struct gfs2_rgrpd;
struct gfs2_sbd;
struct gfs2_holder;
-extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
-extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);
-extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
-extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
-extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
-extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
-extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+int gfs2_rindex_update(struct gfs2_sbd *sdp);
+void gfs2_free_clones(struct gfs2_rgrpd *rgd);
+int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
+void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
-extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
#define GFS2_AF_ORLOV 1
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip,
- struct gfs2_alloc_parms *ap);
-extern void gfs2_inplace_release(struct gfs2_inode *ip);
-
-extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
- bool dinode, u64 *generation);
-
-extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip);
-extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
- u64 bstart, u32 blen, int meta);
-extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
- u64 bstart, u32 blen);
-extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
-extern void gfs2_unlink_di(struct inode *inode);
-extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
- unsigned int type);
+int gfs2_inplace_reserve(struct gfs2_inode *ip,
+ struct gfs2_alloc_parms *ap);
+void gfs2_inplace_release(struct gfs2_inode *ip);
+
+int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
+ bool dinode);
+
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
+void gfs2_rs_delete(struct gfs2_inode *ip);
+void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen, int meta);
+void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen);
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+void gfs2_unlink_di(struct inode *inode);
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+ unsigned int type);
struct gfs2_rgrp_list {
unsigned int rl_rgrps;
@@ -62,18 +62,19 @@ struct gfs2_rgrp_list {
struct gfs2_holder *rl_ghs;
};
-extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
- u64 block);
-extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
- unsigned int state, u16 flags);
-extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
-extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
-extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
- const char *fs_id_buf);
-extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
- struct buffer_head *bh,
- const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
-extern int gfs2_fitrim(struct file *filp, void __user *argp);
+void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
+ u64 block);
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
+ unsigned int state, u16 flags);
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
+ const char *fs_id_buf);
+int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+ struct buffer_head *bh,
+ const struct gfs2_bitmap *bi, unsigned minlen,
+ u64 *ptrimmed);
+int gfs2_fitrim(struct file *filp, void __user *argp);
/* This is how to tell if a reservation is in the rgrp tree: */
static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
@@ -88,9 +89,9 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
return first <= block && block < last;
}
-extern void check_and_update_goal(struct gfs2_inode *ip);
+void check_and_update_goal(struct gfs2_inode *ip);
-extern void rgrp_lock_local(struct gfs2_rgrpd *rgd);
-extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd);
+void rgrp_lock_local(struct gfs2_rgrpd *rgd);
+void rgrp_unlock_local(struct gfs2_rgrpd *rgd);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 02d93da21b2b..d21c04a22d73 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -410,9 +410,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_nlink = cpu_to_be32(inode->i_nlink);
str->di_size = cpu_to_be64(i_size_read(inode));
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
- str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
- str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec);
+ str->di_atime = cpu_to_be64(inode_get_atime_sec(inode));
+ str->di_mtime = cpu_to_be64(inode_get_mtime_sec(inode));
+ str->di_ctime = cpu_to_be64(inode_get_ctime_sec(inode));
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -427,9 +427,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_entries = cpu_to_be32(ip->i_entries);
str->di_eattr = cpu_to_be64(ip->i_eattr);
- str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
- str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec);
+ str->di_atime_nsec = cpu_to_be32(inode_get_atime_nsec(inode));
+ str->di_mtime_nsec = cpu_to_be32(inode_get_mtime_nsec(inode));
+ str->di_ctime_nsec = cpu_to_be32(inode_get_ctime_nsec(inode));
}
/**
@@ -602,13 +602,15 @@ restart:
}
spin_unlock(&sdp->sd_jindex_spin);
- if (!sb_rdonly(sb)) {
+ if (!sb_rdonly(sb))
gfs2_make_fs_ro(sdp);
- }
- if (gfs2_withdrawn(sdp)) {
- gfs2_destroy_threads(sdp);
+ else {
+ if (gfs2_withdrawn(sdp))
+ gfs2_destroy_threads(sdp);
+
gfs2_quota_cleanup(sdp);
}
+
WARN_ON(gfs2_withdrawing(sdp));
/* At this point, we're through modifying the disk */
@@ -1006,6 +1008,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = sc.sc_dinodes + sc.sc_free;
buf->f_ffree = sc.sc_free;
buf->f_namelen = GFS2_FNAMESIZE;
+ buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
return 0;
}
@@ -1299,18 +1302,8 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
* As a last resort, if another node keeps holding the iopen glock
* without showing any activity on the inode glock, we will eventually
* time out and fail the iopen glock upgrade.
- *
- * Note that we're passing the LM_FLAG_TRY_1CB flag to the first
- * locking request as an optimization to notify lock holders as soon as
- * possible. Without that flag, they'd be notified implicitly by the
- * second locking request.
*/
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh);
- error = gfs2_glock_nq(gh);
- if (error != GLR_TRYFAILED)
- return !error;
-
gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh);
error = gfs2_glock_nq(gh);
if (error)
@@ -1550,7 +1543,7 @@ out:
wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put_eventually(ip->i_gl);
- ip->i_gl = NULL;
+ rcu_assign_pointer(ip->i_gl, NULL);
}
}
@@ -1576,7 +1569,7 @@ static void gfs2_free_inode(struct inode *inode)
kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
}
-extern void free_local_statfs_inodes(struct gfs2_sbd *sdp)
+void free_local_statfs_inodes(struct gfs2_sbd *sdp)
{
struct local_statfs_inode *lsi, *safe;
@@ -1591,8 +1584,8 @@ extern void free_local_statfs_inodes(struct gfs2_sbd *sdp)
}
}
-extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
- unsigned int index)
+struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index)
{
struct local_statfs_inode *lsi;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index ab9c83106932..b27a774d9580 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -15,7 +15,7 @@
#define GFS2_FS_FORMAT_MIN (1801)
#define GFS2_FS_FORMAT_MAX (1802)
-extern void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
{
@@ -26,33 +26,33 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
return x;
}
-extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
+void gfs2_jindex_free(struct gfs2_sbd *sdp);
-extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
-extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
- struct gfs2_inode **ipp);
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+ struct gfs2_inode **ipp);
-extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp);
-extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
-extern void gfs2_destroy_threads(struct gfs2_sbd *sdp);
-extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
-extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
- s64 dinodes);
-extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
- const void *buf);
-extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
- void *buf);
-extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh);
-extern int gfs2_statfs_sync(struct super_block *sb, int type);
-extern void gfs2_freeze_func(struct work_struct *work);
-extern void gfs2_thaw_freeze_initiator(struct super_block *sb);
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+void gfs2_make_fs_ro(struct gfs2_sbd *sdp);
+void gfs2_online_uevent(struct gfs2_sbd *sdp);
+void gfs2_destroy_threads(struct gfs2_sbd *sdp);
+int gfs2_statfs_init(struct gfs2_sbd *sdp);
+void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+ s64 dinodes);
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+ const void *buf);
+void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
+ void *buf);
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh);
+int gfs2_statfs_sync(struct super_block *sb, int type);
+void gfs2_freeze_func(struct work_struct *work);
+void gfs2_thaw_freeze_initiator(struct super_block *sb);
-extern void free_local_statfs_inodes(struct gfs2_sbd *sdp);
-extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
- unsigned int index);
-extern void free_sbd(struct gfs2_sbd *sdp);
+void free_local_statfs_inodes(struct gfs2_sbd *sdp);
+struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index);
+void free_sbd(struct gfs2_sbd *sdp);
extern struct file_system_type gfs2_fs_type;
extern struct file_system_type gfs2meta_fs_type;
@@ -60,8 +60,8 @@ extern const struct export_operations gfs2_export_ops;
extern const struct super_operations gfs2_super_ops;
extern const struct dentry_operations gfs2_dops;
-extern const struct xattr_handler *gfs2_xattr_handlers_max[];
-extern const struct xattr_handler **gfs2_xattr_handlers_min;
+extern const struct xattr_handler * const gfs2_xattr_handlers_max[];
+extern const struct xattr_handler * const *gfs2_xattr_handlers_min;
#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index c76ad9a4c75a..f8ce5302280d 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -34,17 +34,17 @@ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned
return rgd->rd_length;
}
-extern int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp,
- unsigned int blocks, unsigned int revokes,
- unsigned long ip);
-extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
- unsigned int revokes);
-
-extern void gfs2_trans_end(struct gfs2_sbd *sdp);
-extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
-extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
-extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
-extern void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr);
+int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp,
+ unsigned int blocks, unsigned int revokes,
+ unsigned long ip);
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+ unsigned int revokes);
+
+void gfs2_trans_end(struct gfs2_sbd *sdp);
+void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
+void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
+void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr);
#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index cdb839529175..11c9d59b6889 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -147,10 +147,10 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
char *file, unsigned int line);
-extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- bool verbose);
-extern int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp);
-extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh);
+int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ bool verbose);
+int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp);
+void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh);
#define gfs2_io_error(sdp) \
gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4fea70c0fe3d..8c96ba6230d1 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -639,7 +639,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
u64 block;
int error;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, block, 1);
@@ -701,7 +701,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
int mh_size = sizeof(struct gfs2_meta_header);
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, block, 1);
@@ -1002,7 +1002,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
} else {
u64 blk;
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &blk, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, blk, 1);
@@ -1494,7 +1494,7 @@ static const struct xattr_handler gfs2_xattr_trusted_handler = {
.set = gfs2_xattr_set,
};
-const struct xattr_handler *gfs2_xattr_handlers_max[] = {
+const struct xattr_handler * const gfs2_xattr_handlers_max[] = {
/* GFS2_FS_FORMAT_MAX */
&gfs2_xattr_trusted_handler,
@@ -1504,4 +1504,4 @@ const struct xattr_handler *gfs2_xattr_handlers_max[] = {
NULL,
};
-const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
+const struct xattr_handler * const *gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
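The gfs2 xattr table change above (and the matching hfs and hfsplus hunks below) only tightens const-ness: the handler tables become arrays of const pointers to const objects, so neither a table slot nor the handler it points to can be modified through these names. A tiny stand-alone illustration of what each const covers (hypothetical struct, not the kernel one):

#include <stdio.h>

struct handler { const char *prefix; };

static const struct handler trusted = { "trusted." };
static const struct handler user    = { "user." };

/* Array of const pointers to const handlers: fully read-only. */
static const struct handler *const handlers[] = { &trusted, &user, NULL };

int main(void)
{
	/* handlers[0] = &user;        would not compile: the slot is const    */
	/* handlers[0]->prefix = "x";  would not compile: the pointee is const */
	for (const struct handler *const *h = handlers; *h; h++)
		printf("%s\n", (*h)->prefix);
	return 0;
}

The gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1 assignment in the hunk works the same way with either declaration; it simply starts the table one slot later, skipping the first entry (the one tagged GFS2_FS_FORMAT_MAX).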
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index 2aed9d7d483d..eb12eb7e37c1 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -50,14 +50,14 @@ struct gfs2_ea_location {
struct gfs2_ea_header *el_prev;
};
-extern int __gfs2_xattr_set(struct inode *inode, const char *name,
- const void *value, size_t size,
- int flags, int type);
-extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
+int __gfs2_xattr_set(struct inode *inode, const char *name,
+ const void *value, size_t size,
+ int flags, int type);
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
-extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 6341bb248247..f8395cdd1adf 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -146,7 +146,7 @@ static const struct xattr_handler hfs_type_handler = {
.set = hfs_xattr_set,
};
-const struct xattr_handler *hfs_xattr_handlers[] = {
+const struct xattr_handler * const hfs_xattr_handlers[] = {
&hfs_creator_handler,
&hfs_type_handler,
NULL
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 632c226a3972..d63880e7d9d6 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
goto err1;
dir->i_size++;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
hfs_find_exit(&fd);
return 0;
@@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
}
dir->i_size--;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
res = 0;
out:
@@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
dst_dir->i_size++;
- dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
+ inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
mark_inode_dirty(dst_dir);
/* finally remove the old entry */
@@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
src_dir->i_size--;
- src_dir->i_mtime = inode_set_ctime_current(src_dir);
+ inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
mark_inode_dirty(src_dir);
type = entry.type;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 49d02524e667..b5a6ad5df357 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -215,7 +215,7 @@ extern void hfs_evict_inode(struct inode *);
extern void hfs_delete_inode(struct inode *);
/* attr.c */
-extern const struct xattr_handler *hfs_xattr_handlers[];
+extern const struct xattr_handler * const hfs_xattr_handlers[];
/* mdb.c */
extern int hfs_mdb_get(struct super_block *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index ee349b72cfb3..a7bc4690a780 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
HFS_I(inode)->fs_blocks = 0;
@@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_mode |= S_IWUGO;
inode->i_mode &= ~hsb->s_file_umask;
inode->i_mode |= S_IFREG;
- inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
- hfs_m_to_utime(rec->file.MdDat));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat))));
inode->i_op = &hfs_file_inode_operations;
inode->i_fop = &hfs_file_operations;
inode->i_mapping->a_ops = &hfs_aops;
@@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
HFS_I(inode)->fs_blocks = 0;
inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
- inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
- hfs_m_to_utime(rec->dir.MdDat));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat))));
inode->i_op = &hfs_dir_inode_operations;
inode->i_fop = &hfs_dir_operations;
break;
@@ -474,7 +474,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
be32_to_cpu(rec.dir.DirID) != inode->i_ino) {
}
- rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime);
+ rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
rec.dir.Val = cpu_to_be16(inode->i_size - 2);
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
@@ -502,7 +502,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
else
rec.file.Flags |= HFS_FIL_LOCK;
hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen);
- rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime);
+ rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
@@ -654,7 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
truncate_setsize(inode, attr->ia_size);
hfs_file_truncate(inode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
setattr_copy(&nop_mnt_idmap, inode, attr);
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index dc27d418fbcd..76fa02e3835b 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -28,11 +28,13 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
/* fix up inode on a timezone change */
diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest;
if (diff) {
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts = inode_get_ctime(inode);
- inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec);
- inode->i_atime.tv_sec += diff;
- inode->i_mtime.tv_sec += diff;
+ inode_set_ctime(inode, ts.tv_sec + diff, ts.tv_nsec);
+ ts = inode_get_atime(inode);
+ inode_set_atime(inode, ts.tv_sec + diff, ts.tv_nsec);
+ ts = inode_get_mtime(inode);
+ inode_set_mtime(inode, ts.tv_sec + diff, ts.tv_nsec);
HFS_I(inode)->tz_secondswest += diff;
}
return 1;
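The hfs, hfsplus, hpfs and isofs hunks in this stretch are one conversion applied over and over: direct reads and writes of inode->i_atime and inode->i_mtime are replaced with accessor helpers (inode_get_mtime(), inode_set_mtime_to_ts(), inode_set_atime(), and the *_sec/*_nsec variants), matching what was already done for ctime. A minimal user-space analogue of the accessor pattern, with a chained set like the ones used in hfs_read_inode() above (illustrative names, not the kernel API):

#include <stdio.h>
#include <time.h>

struct my_inode {
	struct timespec atime;	/* treated as private: only touched via helpers */
	struct timespec mtime;
};

static struct timespec my_get_mtime(const struct my_inode *inode)
{
	return inode->mtime;
}

static struct timespec my_set_mtime(struct my_inode *inode, struct timespec ts)
{
	inode->mtime = ts;
	return ts;		/* returning ts is what makes chained sets possible */
}

static struct timespec my_set_atime(struct my_inode *inode, struct timespec ts)
{
	inode->atime = ts;
	return ts;
}

int main(void)
{
	struct my_inode inode = { { 0, 0 }, { 0, 0 } };
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	/* mtime = atime = now in one expression, like
	 * inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, ...)). */
	my_set_mtime(&inode, my_set_atime(&inode, now));
	printf("mtime %lld\n", (long long)my_get_mtime(&inode).tv_sec);
	return 0;
}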
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index e71ae2537eaa..1995bafee839 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
dir->i_size++;
if (S_ISDIR(inode->i_mode))
hfsplus_subfolders_inc(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
hfs_find_exit(&fd);
@@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
@@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_size++;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_inc(dst_dir);
- dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
+ inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
/* finally remove the old entry */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
@@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(src_dir);
- src_dir->i_mtime = inode_set_ctime_current(src_dir);
+ inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
/* remove old thread entry */
hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c65c8c4b03dd..702a0663b1d8 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap,
}
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}
setattr_copy(&nop_mnt_idmap, inode, attr);
@@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
inode->i_ino = sbi->next_cnid++;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
hip = HFSPLUS_I(inode);
INIT_LIST_HEAD(&hip->open_dir_list);
@@ -521,8 +521,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
hfsplus_get_perms(inode, &folder->permissions, 1);
set_nlink(inode, 1);
inode->i_size = 2 + be32_to_cpu(folder->valence);
- inode->i_atime = hfsp_mt2ut(folder->access_date);
- inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
+ inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date));
+ inode_set_mtime_to_ts(inode,
+ hfsp_mt2ut(folder->content_mod_date));
inode_set_ctime_to_ts(inode,
hfsp_mt2ut(folder->attribute_mod_date));
HFSPLUS_I(inode)->create_date = folder->create_date;
@@ -563,8 +564,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
init_special_inode(inode, inode->i_mode,
be32_to_cpu(file->permissions.dev));
}
- inode->i_atime = hfsp_mt2ut(file->access_date);
- inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
+ inode_set_atime_to_ts(inode, hfsp_mt2ut(file->access_date));
+ inode_set_mtime_to_ts(inode,
+ hfsp_mt2ut(file->content_mod_date));
inode_set_ctime_to_ts(inode,
hfsp_mt2ut(file->attribute_mod_date));
HFSPLUS_I(inode)->create_date = file->create_date;
@@ -609,8 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
hfsplus_cat_set_perms(inode, &folder->permissions);
- folder->access_date = hfsp_ut2mt(inode->i_atime);
- folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+ folder->access_date = hfsp_ut2mt(inode_get_atime(inode));
+ folder->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode));
folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
folder->valence = cpu_to_be32(inode->i_size - 2);
if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
@@ -644,8 +646,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
else
file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
- file->access_date = hfsp_ut2mt(inode->i_atime);
- file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+ file->access_date = hfsp_ut2mt(inode_get_atime(inode));
+ file->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode));
file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 58021e73c00b..9c9ff6b8c6f7 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -13,7 +13,7 @@
static int hfsplus_removexattr(struct inode *inode, const char *name);
-const struct xattr_handler *hfsplus_xattr_handlers[] = {
+const struct xattr_handler * const hfsplus_xattr_handlers[] = {
&hfsplus_xattr_osx_handler,
&hfsplus_xattr_user_handler,
&hfsplus_xattr_trusted_handler,
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index d14e362b3eba..15cc55e41410 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -17,7 +17,7 @@ extern const struct xattr_handler hfsplus_xattr_user_handler;
extern const struct xattr_handler hfsplus_xattr_trusted_handler;
extern const struct xattr_handler hfsplus_xattr_security_handler;
-extern const struct xattr_handler *hfsplus_xattr_handlers[];
+extern const struct xattr_handler * const hfsplus_xattr_handlers[];
int __hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dc5a5cea5fae..ea87f24c6c3f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -513,10 +513,14 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st)
set_nlink(ino, st->nlink);
i_uid_write(ino, st->uid);
i_gid_write(ino, st->gid);
- ino->i_atime =
- (struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec };
- ino->i_mtime =
- (struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec };
+ inode_set_atime_to_ts(ino, (struct timespec64){
+ st->atime.tv_sec,
+ st->atime.tv_nsec,
+ });
+ inode_set_mtime_to_ts(ino, (struct timespec64){
+ st->mtime.tv_sec,
+ st->mtime.tv_nsec,
+ });
inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec);
ino->i_size = st->size;
ino->i_blocks = st->blocks;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index f36566d61215..49dd585c2b17 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -277,14 +277,16 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in
* inode.
*/
- if (!inode_get_ctime(result).tv_sec) {
+ if (!inode_get_ctime_sec(result)) {
time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date));
inode_set_ctime(result, csec ? csec : 1, 0);
- result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date));
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date));
- result->i_atime.tv_nsec = 0;
+ inode_set_mtime(result,
+ local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)),
+ 0);
+ inode_set_atime(result,
+ local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)),
+ 0);
hpfs_result->i_ea_size = le32_to_cpu(de->ea_size);
if (!hpfs_result->i_ea_mode && de->read_only)
result->i_mode &= ~0222;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 479166378bae..a59e8fa630db 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -37,8 +37,8 @@ void hpfs_init_inode(struct inode *i)
hpfs_inode->i_dirty = 0;
inode_set_ctime(i, 0, 0);
- i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0;
- i->i_atime.tv_sec = i->i_atime.tv_nsec = 0;
+ inode_set_mtime(i, 0, 0);
+ inode_set_atime(i, 0, 0);
}
void hpfs_read_inode(struct inode *i)
@@ -230,9 +230,9 @@ void hpfs_write_inode_nolock(struct inode *i)
}
hpfs_write_inode_ea(i, fnode);
if (de) {
- de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
- de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
+ de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i)));
+ de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i)));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i)));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size);
hpfs_mark_4buffers_dirty(&qbh);
@@ -240,9 +240,9 @@ void hpfs_write_inode_nolock(struct inode *i)
}
if (S_ISDIR(i->i_mode)) {
if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) {
- de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
- de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
+ de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i)));
+ de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i)));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i)));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0);
de->file_size = cpu_to_le32(0);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4eb8d6f5989..9184b4584b01 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -12,10 +12,10 @@
static void hpfs_update_directory_times(struct inode *dir)
{
time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
- if (t == dir->i_mtime.tv_sec &&
- t == inode_get_ctime(dir).tv_sec)
+ if (t == inode_get_mtime_sec(dir) &&
+ t == inode_get_ctime_sec(dir))
return;
- dir->i_mtime = inode_set_ctime(dir, t, 0);
+ inode_set_mtime_to_ts(dir, inode_set_ctime(dir, t, 0));
hpfs_write_inode_nolock(dir);
}
@@ -58,8 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
hpfs_i(result)->i_dno = dno;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_mode |= S_IFDIR;
result->i_op = &hpfs_dir_iops;
@@ -164,8 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
result->i_fop = &hpfs_file_ops;
set_nlink(result, 1);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
if (dee.read_only)
result->i_mode &= ~0222;
@@ -245,8 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
hpfs_init_inode(result);
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
@@ -319,8 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_init_inode(result);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_mode = S_IFLNK | 0777;
result->i_uid = current_fsuid();
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 758a51564124..6b0ba3c1efba 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -725,10 +725,12 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
if (!de)
hpfs_error(s, "unable to find root dir");
else {
- root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date));
- root->i_atime.tv_nsec = 0;
- root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date));
- root->i_mtime.tv_nsec = 0;
+ inode_set_atime(root,
+ local_to_gmt(s, le32_to_cpu(de->read_date)),
+ 0);
+ inode_set_mtime(root,
+ local_to_gmt(s, le32_to_cpu(de->write_date)),
+ 0);
inode_set_ctime(root,
local_to_gmt(s, le32_to_cpu(de->creation_date)),
0);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 316c4cebd3f3..f757d4f7ad98 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,29 +83,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
{}
};
-#ifdef CONFIG_NUMA
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
-{
- vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
- index);
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
- mpol_cond_put(vma->vm_policy);
-}
-#else
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
-{
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
-}
-#endif
-
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
@@ -135,7 +112,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
vma->vm_ops = &hugetlb_vm_ops;
- ret = seal_check_future_write(info->seals, vma);
+ ret = seal_check_write(info->seals, vma);
if (ret)
return ret;
@@ -295,7 +272,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt
size_t res = 0;
/* First subpage to start the loop. */
- page += offset / PAGE_SIZE;
+ page = nth_page(page, offset / PAGE_SIZE);
offset %= PAGE_SIZE;
while (1) {
if (is_raw_hwpoison_page_in_hugepage(page))
@@ -309,7 +286,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt
break;
offset += n;
if (offset == PAGE_SIZE) {
- page++;
+ page = nth_page(page, 1);
offset = 0;
}
}
@@ -334,7 +311,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
ssize_t retval = 0;
while (iov_iter_count(to)) {
- struct page *page;
+ struct folio *folio;
size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
@@ -352,18 +329,18 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
nr = nr - offset;
- /* Find the page */
- page = find_lock_page(mapping, index);
- if (unlikely(page == NULL)) {
+ /* Find the folio */
+ folio = filemap_lock_hugetlb_folio(h, mapping, index);
+ if (IS_ERR(folio)) {
/*
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
*/
copied = iov_iter_zero(nr, to);
} else {
- unlock_page(page);
+ folio_unlock(folio);
- if (!PageHWPoison(page))
+ if (!folio_test_has_hwpoisoned(folio))
want = nr;
else {
/*
@@ -371,19 +348,19 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
* touching the 1st raw HWPOISON subpage after
* offset.
*/
- want = adjust_range_hwpoison(page, offset, nr);
+ want = adjust_range_hwpoison(&folio->page, offset, nr);
if (want == 0) {
- put_page(page);
+ folio_put(folio);
retval = -EIO;
break;
}
}
/*
- * We have the page, copy it to user space buffer.
+ * We have the folio, copy it to user space buffer.
*/
- copied = copy_page_to_iter(page, offset, want, to);
- put_page(page);
+ copied = copy_folio_to_iter(folio, offset, want, to);
+ folio_put(folio);
}
offset += copied;
retval += copied;
@@ -661,21 +638,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
- const pgoff_t start = lstart >> huge_page_shift(h);
- const pgoff_t end = lend >> huge_page_shift(h);
+ const pgoff_t end = lend >> PAGE_SHIFT;
struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
folio_batch_init(&fbatch);
- next = start;
+ next = lstart >> PAGE_SHIFT;
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
for (i = 0; i < folio_batch_count(&fbatch); ++i) {
struct folio *folio = fbatch.folios[i];
u32 hash = 0;
- index = folio->index;
+ index = folio->index >> huge_page_order(h);
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -693,7 +669,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
if (truncate_op)
- (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
+ (void)hugetlb_unreserve_pages(inode,
+ lstart >> huge_page_shift(h),
+ LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
@@ -741,7 +719,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
pgoff_t idx = start >> huge_page_shift(h);
struct folio *folio;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio))
return;
@@ -852,8 +830,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/*
* Initialize a pseudo vma as this is required by the huge page
- * allocation routines. If NUMA is configured, use page index
- * as input to create an allocation policy.
+ * allocation routines.
*/
vma_init(&pseudo_vma, mm);
vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
@@ -886,7 +863,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
- folio = filemap_get_folio(mapping, index);
+ folio = filemap_get_folio(mapping, index << huge_page_order(h));
if (!IS_ERR(folio)) {
folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -901,9 +878,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
* folios in these areas, we need to consume the reserves
* to keep reservation accounting consistent.
*/
- hugetlb_set_vma_policy(&pseudo_vma, inode, index);
folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
- hugetlb_drop_vma_policy(&pseudo_vma);
if (IS_ERR(folio)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
error = PTR_ERR(folio);
@@ -980,7 +955,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
inode->i_mode = S_IFDIR | ctx->mode;
inode->i_uid = ctx->uid;
inode->i_gid = ctx->gid;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -1024,7 +999,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->private_data = resv_map;
info->seals = F_SEAL_SEAL;
switch (mode & S_IFMT) {
@@ -1067,7 +1042,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (!inode)
return -ENOSPC;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
d_instantiate(dentry, inode);
dget(dentry);/* Extra count - pin the dentry in core */
return 0;
@@ -1099,7 +1074,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
if (!inode)
return -ENOSPC;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
d_tmpfile(file, inode);
return finish_open_simple(file, 0);
}
@@ -1121,7 +1096,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
} else
iput(inode);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return error;
}
@@ -1204,7 +1179,9 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
struct hstate *h = hstate_inode(d_inode(dentry));
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_type = HUGETLBFS_MAGIC;
buf->f_bsize = huge_page_size(h);
if (sbinfo) {
@@ -1282,18 +1259,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
}
-
- /*
- * Any time after allocation, hugetlbfs_destroy_inode can be called
- * for the inode. mpol_free_shared_policy is unconditionally called
- * as part of hugetlbfs_destroy_inode. So, initialize policy here
- * in case of a quick call to destroy.
- *
- * Note that the policy is initialized even if we are creating a
- * private inode. This simplifies hugetlbfs_destroy_inode.
- */
- mpol_shared_policy_init(&p->policy, NULL);
-
return &p->vfs_inode;
}
@@ -1305,7 +1270,6 @@ static void hugetlbfs_free_inode(struct inode *inode)
static void hugetlbfs_destroy_inode(struct inode *inode)
{
hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
- mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
}
static const struct address_space_operations hugetlbfs_aops = {
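Several hugetlbfs hunks above are conversions between huge-page and base-page (PAGE_SIZE) index units: the fault-mutex hash is computed from the huge-page index, so folio->index is shifted down by huge_page_order(), while filemap_get_folio() is handed the index shifted back up. The arithmetic in isolation (the 4 KiB / 2 MiB page sizes below are an assumption for the example, not kernel constants):

#include <stdio.h>

#define PAGE_SHIFT	12				/* 4 KiB base pages (assumption) */
#define HPAGE_SHIFT	21				/* 2 MiB huge pages (assumption) */
#define HPAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)	/* 9 */

int main(void)
{
	unsigned long long offset = 5ULL << HPAGE_SHIFT;	/* start of huge page number 5 */

	unsigned long long base_index = offset >> PAGE_SHIFT;		/* 2560 */
	unsigned long long huge_index = base_index >> HPAGE_ORDER;	/* 5    */

	/* Round trip: the page-cache lookup wants the base-page index back. */
	printf("base %llu huge %llu back %llu\n",
	       base_index, huge_index, huge_index << HPAGE_ORDER);
	return 0;
}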
diff --git a/fs/init.c b/fs/init.c
index 9684406a8416..e9387b6c4f30 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mknod(&path, dentry, mode, dev);
if (!error)
error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
@@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mkdir(&path, dentry, mode);
if (!error)
error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
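The init_mknod() and init_mkdir() hunks above replace the open-coded umask handling with a helper; the underlying computation is simply masking the requested creation mode with the complement of the process umask, skipped when POSIX ACLs supply default permissions instead. The masking in isolation (hypothetical values):

#include <stdio.h>

/* Strip the umask bits from a requested creation mode. */
static unsigned int strip_umask(unsigned int mode, unsigned int umask)
{
	return mode & ~umask;
}

int main(void)
{
	/* 0777 requested under the common 022 umask yields 0755. */
	printf("%o\n", strip_umask(0777, 022));
	return 0;
}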
diff --git a/fs/inode.c b/fs/inode.c
index 84bc3c76e5cc..f238d987dec9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -54,9 +54,9 @@
* inode_hash_lock
*/
-static unsigned int i_hash_mask __read_mostly;
-static unsigned int i_hash_shift __read_mostly;
-static struct hlist_head *inode_hashtable __read_mostly;
+static unsigned int i_hash_mask __ro_after_init;
+static unsigned int i_hash_shift __ro_after_init;
+static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
/*
@@ -70,7 +70,7 @@ EXPORT_SYMBOL(empty_aops);
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);
-static struct kmem_cache *inode_cachep __read_mostly;
+static struct kmem_cache *inode_cachep __ro_after_init;
static long get_nr_inodes(void)
{
@@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
lockdep_set_class_and_name(&mapping->invalidate_lock,
&sb->s_type->invalidate_lock_key,
"mapping.invalidate_lock");
+ if (sb->s_iflags & SB_I_STABLE_WRITES)
+ mapping_set_stable_writes(mapping);
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -1837,27 +1839,29 @@ EXPORT_SYMBOL(bmap);
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
- struct timespec64 ctime;
+ struct timespec64 atime, mtime, ctime;
if (!(mnt->mnt_flags & MNT_RELATIME))
return 1;
/*
* Is mtime younger than or equal to atime? If yes, update atime:
*/
- if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+ atime = inode_get_atime(inode);
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&mtime, &atime) >= 0)
return 1;
/*
* Is ctime younger than or equal to atime? If yes, update atime:
*/
ctime = inode_get_ctime(inode);
- if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
+ if (timespec64_compare(&ctime, &atime) >= 0)
return 1;
/*
* Is the previous atime value older than a day? If yes,
* update atime:
*/
- if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
return 1;
/*
* Good, we can skip the atime update:
@@ -1888,12 +1892,13 @@ int inode_update_timestamps(struct inode *inode, int flags)
if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
now = inode_set_ctime_current(inode);
if (!timespec64_equal(&now, &ctime))
updated |= S_CTIME;
- if (!timespec64_equal(&now, &inode->i_mtime)) {
- inode->i_mtime = now;
+ if (!timespec64_equal(&now, &mtime)) {
+ inode_set_mtime_to_ts(inode, now);
updated |= S_MTIME;
}
if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
@@ -1903,8 +1908,10 @@ int inode_update_timestamps(struct inode *inode, int flags)
}
if (flags & S_ATIME) {
- if (!timespec64_equal(&now, &inode->i_atime)) {
- inode->i_atime = now;
+ struct timespec64 atime = inode_get_atime(inode);
+
+ if (!timespec64_equal(&now, &atime)) {
+ inode_set_atime_to_ts(inode, now);
updated |= S_ATIME;
}
}
@@ -1963,7 +1970,7 @@ EXPORT_SYMBOL(inode_update_time);
bool atime_needs_update(const struct path *path, struct inode *inode)
{
struct vfsmount *mnt = path->mnt;
- struct timespec64 now;
+ struct timespec64 now, atime;
if (inode->i_flags & S_NOATIME)
return false;
@@ -1989,7 +1996,8 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
if (!relatime_need_update(mnt, inode, now))
return false;
- if (timespec64_equal(&inode->i_atime, &now))
+ atime = inode_get_atime(inode);
+ if (timespec64_equal(&atime, &now))
return false;
return true;
@@ -2006,7 +2014,7 @@ void touch_atime(const struct path *path)
if (!sb_start_write_trylock(inode->i_sb))
return;
- if (__mnt_want_write(mnt) != 0)
+ if (mnt_get_write_access(mnt) != 0)
goto skip_update;
/*
* File systems can error out when updating inodes if they need to
@@ -2018,7 +2026,7 @@ void touch_atime(const struct path *path)
* of the fs read only, e.g. subvolumes in Btrfs.
*/
inode_update_time(inode, S_ATIME);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
skip_update:
sb_end_write(inode->i_sb);
}
@@ -2106,17 +2114,18 @@ static int inode_needs_update_time(struct inode *inode)
{
int sync_it = 0;
struct timespec64 now = current_time(inode);
- struct timespec64 ctime;
+ struct timespec64 ts;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- if (!timespec64_equal(&inode->i_mtime, &now))
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(&ts, &now))
sync_it = S_MTIME;
- ctime = inode_get_ctime(inode);
- if (!timespec64_equal(&ctime, &now))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(&ts, &now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2131,9 +2140,9 @@ static int __file_update_time(struct file *file, int sync_mode)
struct inode *inode = file_inode(file);
/* try to update time settings */
- if (!__mnt_want_write_file(file)) {
+ if (!mnt_get_write_access_file(file)) {
ret = inode_update_time(inode, sync_mode);
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
}
return ret;
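The relatime_need_update() hunk above only changes how the timestamps are read (via inode_get_atime()/inode_get_mtime() instead of the struct fields); the decision itself is unchanged: atime is updated when mtime or ctime is not older than atime, or when the stored atime is more than a day old. A stand-alone sketch of that rule over plain struct timespec values (simplified: the MNT_RELATIME mount-flag check is left out):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static int ts_cmp(const struct timespec *a, const struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec < b->tv_sec ? -1 : 1;
	if (a->tv_nsec != b->tv_nsec)
		return a->tv_nsec < b->tv_nsec ? -1 : 1;
	return 0;
}

/* The three tests from relatime_need_update(), in the same order. */
static bool relatime_needs_update(struct timespec atime, struct timespec mtime,
				  struct timespec ctime_ts, struct timespec now)
{
	if (ts_cmp(&mtime, &atime) >= 0)		/* mtime younger or equal */
		return true;
	if (ts_cmp(&ctime_ts, &atime) >= 0)		/* ctime younger or equal */
		return true;
	if (now.tv_sec - atime.tv_sec >= 24 * 60 * 60)	/* atime older than a day */
		return true;
	return false;
}

int main(void)
{
	struct timespec now = { 1000000, 0 };
	struct timespec old = { 1000000 - 2 * 24 * 60 * 60, 0 };

	printf("%d\n", relatime_needs_update(old, old, old, now));	/* 1 */
	printf("%d\n", relatime_needs_update(now, old, old, now));	/* 0 */
	return 0;
}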
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..58e43341aebf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);
@@ -94,14 +94,22 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void release_empty_file(struct file *f);
+
+static inline void file_put_write_access(struct file *file)
+{
+ put_write_access(file->f_inode);
+ mnt_put_write_access(file->f_path.mnt);
+ if (unlikely(file->f_mode & FMODE_BACKING))
+ mnt_put_write_access(backing_file_user_path(file)->mnt);
+}
static inline void put_file_access(struct file *file)
{
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_dec(file->f_inode);
} else if (file->f_mode & FMODE_WRITER) {
- put_write_access(file->f_inode);
- __mnt_drop_write(file->f_path.mnt);
+ file_put_write_access(file);
}
}
@@ -130,9 +138,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb)
* mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
* cleared, it will see s_readonly_remount set.
* For RW->RO transition, the barrier pairs with the barrier in
- * __mnt_want_write() before the mnt_is_readonly() check. The barrier
- * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
- * cleared, it will see s_readonly_remount set.
+ * mnt_get_write_access() before the mnt_is_readonly() check.
+ * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
+ * already cleared, it will see s_readonly_remount set.
*/
smp_wmb();
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 2bc0aa23fde3..f72df2babe56 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -29,9 +29,9 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
* and I/O completions.
*/
struct iomap_folio_state {
- atomic_t read_bytes_pending;
- atomic_t write_bytes_pending;
spinlock_t state_lock;
+ unsigned int read_bytes_pending;
+ atomic_t write_bytes_pending;
/*
* Each block has two bits in this bitmap:
@@ -57,30 +57,32 @@ static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
return test_bit(block, ifs->state);
}
-static void ifs_set_range_uptodate(struct folio *folio,
+static bool ifs_set_range_uptodate(struct folio *folio,
struct iomap_folio_state *ifs, size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
unsigned int first_blk = off >> inode->i_blkbits;
unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
unsigned int nr_blks = last_blk - first_blk + 1;
- unsigned long flags;
- spin_lock_irqsave(&ifs->state_lock, flags);
bitmap_set(ifs->state, first_blk, nr_blks);
- if (ifs_is_fully_uptodate(folio, ifs))
- folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&ifs->state_lock, flags);
+ return ifs_is_fully_uptodate(folio, ifs);
}
static void iomap_set_range_uptodate(struct folio *folio, size_t off,
size_t len)
{
struct iomap_folio_state *ifs = folio->private;
+ unsigned long flags;
+ bool uptodate = true;
- if (ifs)
- ifs_set_range_uptodate(folio, ifs, off, len);
- else
+ if (ifs) {
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+ }
+
+ if (uptodate)
folio_mark_uptodate(folio);
}
@@ -181,7 +183,7 @@ static void ifs_free(struct folio *folio)
if (!ifs)
return;
- WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending));
+ WARN_ON_ONCE(ifs->read_bytes_pending != 0);
WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
folio_test_uptodate(folio));
@@ -248,20 +250,28 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
-static void iomap_finish_folio_read(struct folio *folio, size_t offset,
+static void iomap_finish_folio_read(struct folio *folio, size_t off,
size_t len, int error)
{
struct iomap_folio_state *ifs = folio->private;
+ bool uptodate = !error;
+ bool finished = true;
- if (unlikely(error)) {
- folio_clear_uptodate(folio);
- folio_set_error(folio);
- } else {
- iomap_set_range_uptodate(folio, offset, len);
+ if (ifs) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ if (!error)
+ uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ ifs->read_bytes_pending -= len;
+ finished = !ifs->read_bytes_pending;
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
}
- if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending))
- folio_unlock(folio);
+ if (error)
+ folio_set_error(folio);
+ if (finished)
+ folio_end_read(folio, uptodate);
}
static void iomap_read_end_io(struct bio *bio)
@@ -358,8 +368,11 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
}
ctx->cur_folio_in_bio = true;
- if (ifs)
- atomic_add(plen, &ifs->read_bytes_pending);
+ if (ifs) {
+ spin_lock_irq(&ifs->state_lock);
+ ifs->read_bytes_pending += plen;
+ spin_unlock_irq(&ifs->state_lock);
+ }
sector = iomap_sector(iomap, pos);
if (!ctx->bio ||
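With read_bytes_pending now a plain counter guarded by state_lock, submission and completion serialize on the same lock that protects the uptodate bitmap, so the final completion can mark the folio and end the read in one step. A condensed sketch of the two sides, using the names from the hunks above:

/* submission (iomap_readpage_iter): account for bytes about to be read */
spin_lock_irq(&ifs->state_lock);
ifs->read_bytes_pending += plen;
spin_unlock_irq(&ifs->state_lock);

/* completion (iomap_finish_folio_read): mark range, detect the last bio */
spin_lock_irqsave(&ifs->state_lock, flags);
if (!error)
        uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
ifs->read_bytes_pending -= len;
finished = !ifs->read_bytes_pending;
spin_unlock_irqrestore(&ifs->state_lock, flags);
if (finished)
        folio_end_read(folio, uptodate);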
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 2ee21286ac8f..3e4d53e26f94 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1422,8 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_ino, de->flags[-high_sierra]);
}
#endif
- inode->i_mtime = inode->i_atime =
- inode_set_ctime(inode, iso_date(de->date, high_sierra), 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0)));
ei->i_first_extent = (isonum_733(de->extent) +
isonum_711(de->ext_attr_length));
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 348783a70f57..d6c17ad69dee 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -426,16 +426,14 @@ repeat:
0);
}
if (rr->u.TF.flags & TF_MODIFY) {
- inode->i_mtime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
if (rr->u.TF.flags & TF_ACCESS) {
- inode->i_atime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_atime.tv_nsec = 0;
+ inode_set_atime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
if (rr->u.TF.flags & TF_ATTRIBUTES) {
inode_set_ctime(inode,
@@ -531,9 +529,9 @@ repeat:
inode->i_rdev = reloc->i_rdev;
inode->i_size = reloc->i_size;
inode->i_blocks = reloc->i_blocks;
- inode->i_atime = reloc->i_atime;
+ inode_set_atime_to_ts(inode, inode_get_atime(reloc));
inode_set_ctime_to_ts(inode, inode_get_ctime(reloc));
- inode->i_mtime = reloc->i_mtime;
+ inode_set_mtime_to_ts(inode, inode_get_mtime(reloc));
iput(reloc);
break;
#ifdef CONFIG_ZISOFS
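The isofs hunks follow the timestamp-accessor conversion applied throughout this series: direct stores to inode->i_atime and inode->i_mtime become calls to the inode_set_*time helpers, which return the timespec64 they stored so assignments can still be chained. The idiom as a minimal sketch (secs stands in for the decoded on-disk time):

/* old style: assign the timespec64 fields directly */
inode->i_mtime = inode->i_atime = inode_set_ctime(inode, secs, 0);

/* new style: go through the accessors and chain their return values */
inode_set_mtime_to_ts(inode,
        inode_set_atime_to_ts(inode, inode_set_ctime(inode, secs, 0)));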
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8d6f934c3d95..5e122586e06e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -119,7 +119,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct commit_header *tmp;
struct buffer_head *bh;
struct timespec64 now;
- blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC;
+ blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;
*cbh = NULL;
@@ -270,6 +270,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
if (!ret)
ret = err;
}
+ cond_resched();
spin_lock(&journal->j_list_lock);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
smp_mb();
@@ -395,8 +396,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
- journal->j_tail,
- REQ_SYNC);
+ journal->j_tail, 0);
mutex_unlock(&journal->j_checkpoint_mutex);
} else {
jbd2_debug(3, "superblock not updated\n");
@@ -715,6 +715,7 @@ start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
+
/*
* Compute checksum.
*/
@@ -727,7 +728,8 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+ submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
+ bh);
}
cond_resched();
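Commit-record and descriptor-block writes now go out with JBD2_JOURNAL_REQ_FLAGS instead of a bare REQ_SYNC. The macro is defined in include/linux/jbd2.h, outside these hunks; assuming it expands to the REQ_META | REQ_SYNC | REQ_IDLE set implied by the "high priority flags" comment in jbd2_write_superblock() below, the submissions look like:

/* assumed definition -- not visible in these hunks */
#define JBD2_JOURNAL_REQ_FLAGS  (REQ_META | REQ_SYNC | REQ_IDLE)

/* journal block submission carries the full flag set */
submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, bh);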
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 30dec2bd2ecc..206cb53ef2b0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1100,8 +1100,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
- REQ_SYNC | REQ_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
if (ret)
goto out;
@@ -1290,7 +1289,7 @@ static int jbd2_min_tag_size(void)
static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+ journal_t *journal = shrink->private_data;
unsigned long nr_to_scan = sc->nr_to_scan;
unsigned long nr_shrunk;
unsigned long count;
@@ -1316,7 +1315,7 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+ journal_t *journal = shrink->private_data;
unsigned long count;
count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
@@ -1588,14 +1587,21 @@ static journal_t *journal_init_common(struct block_device *bdev,
goto err_cleanup;
journal->j_shrink_transaction = NULL;
- journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
- journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
- journal->j_shrinker.seeks = DEFAULT_SEEKS;
- journal->j_shrinker.batch = journal->j_max_transaction_buffers;
- err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
- MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
- if (err)
+
+ journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
+ MAJOR(bdev->bd_dev),
+ MINOR(bdev->bd_dev));
+ if (!journal->j_shrinker) {
+ err = -ENOMEM;
goto err_cleanup;
+ }
+
+ journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
+ journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
+ journal->j_shrinker->batch = journal->j_max_transaction_buffers;
+ journal->j_shrinker->private_data = journal;
+
+ shrinker_register(journal->j_shrinker);
return journal;
@@ -1768,8 +1774,7 @@ static int journal_reset(journal_t *journal)
*/
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
- journal->j_tail,
- REQ_SYNC | REQ_FUA);
+ journal->j_tail, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
return jbd2_journal_start_thread(journal);
@@ -1791,9 +1796,16 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
return -EIO;
}
- trace_jbd2_write_superblock(journal, write_flags);
+ /*
+ * Always set the high-priority flags so journal I/O is exempted from
+ * the block layer's QoS policies, e.g. the writeback throttle.
+ */
+ write_flags |= JBD2_JOURNAL_REQ_FLAGS;
if (!(journal->j_flags & JBD2_BARRIER))
write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
+
+ trace_jbd2_write_superblock(journal, write_flags);
+
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -2043,7 +2055,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
sb->s_errno = cpu_to_be32(errcode);
- jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
+ jbd2_write_superblock(journal, REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
@@ -2164,17 +2176,16 @@ int jbd2_journal_destroy(journal_t *journal)
++journal->j_transaction_sequence;
write_unlock(&journal->j_state_lock);
- jbd2_mark_journal_empty(journal,
- REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
brelse(journal->j_sb_buffer);
}
- if (journal->j_shrinker.flags & SHRINKER_REGISTERED) {
+ if (journal->j_shrinker) {
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
- unregister_shrinker(&journal->j_shrinker);
+ shrinker_free(journal->j_shrinker);
}
if (journal->j_proc_entry)
jbd2_stats_proc_exit(journal);
@@ -2466,7 +2477,7 @@ int jbd2_journal_flush(journal_t *journal, unsigned int flags)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
if (flags)
err = __jbd2_journal_erase(journal, flags);
@@ -2512,7 +2523,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock_io(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
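The jbd2 shrinker moves from a struct shrinker embedded in journal_t to a pointer obtained from shrinker_alloc(), with the journal hung off private_data instead of being recovered via container_of(). The same allocate/register/free pattern appears again in the mbcache hunk further down. A condensed sketch of the lifecycle:

/* setup (journal_init_common) */
journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
                                     MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
if (!journal->j_shrinker)
        return -ENOMEM;         /* sketch: the real code jumps to err_cleanup */
journal->j_shrinker->scan_objects  = jbd2_journal_shrink_scan;
journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
journal->j_shrinker->batch         = journal->j_max_transaction_buffers;
journal->j_shrinker->private_data  = journal;   /* replaces container_of() */
shrinker_register(journal->j_shrinker);

/* teardown (jbd2_journal_destroy) */
if (journal->j_shrinker)
        shrinker_free(journal->j_shrinker);     /* unregisters and frees */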
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index c269a7d29a46..01f744cb97a4 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -289,6 +289,8 @@ int jbd2_journal_recover(journal_t *journal)
journal_superblock_t * sb;
struct recovery_info info;
+ errseq_t wb_err;
+ struct address_space *mapping;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
@@ -306,6 +308,9 @@ int jbd2_journal_recover(journal_t *journal)
return 0;
}
+ wb_err = 0;
+ mapping = journal->j_fs_dev->bd_inode->i_mapping;
+ errseq_check_and_advance(&mapping->wb_err, &wb_err);
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
@@ -329,6 +334,9 @@ int jbd2_journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
+ err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
+ if (!err)
+ err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
err2 = blkdev_issue_flush(journal->j_fs_dev);
@@ -632,7 +640,7 @@ static int do_one_pass(journal_t *journal,
success = err;
printk(KERN_ERR
"JBD2: IO error %d recovering "
- "block %ld in log\n",
+ "block %lu in log\n",
err, io_block);
} else {
unsigned long long blocknr;
@@ -661,7 +669,8 @@ static int do_one_pass(journal_t *journal,
printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
"data block %llu in "
- "log\n", blocknr);
+ "journal block %lu\n",
+ blocknr, io_block);
block_error = 1;
goto skip_write;
}
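Recovery now snapshots the writeback error cursor of the journal device's mapping before replay and checks it again after sync_blockdev(), so a write error hit while replaying blocks is reported instead of being lost. The underlying errseq pattern, reduced to a sketch (replay_blocks() is a hypothetical stand-in for the recovery passes):

errseq_t since = 0;
struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping;

/* advance the private cursor to the device's current error state */
errseq_check_and_advance(&mapping->wb_err, &since);

replay_blocks();                        /* hypothetical: PASS_SCAN/REVOKE/REPLAY */
err = sync_blockdev(journal->j_fs_dev);

/* report any error recorded since the snapshot */
err2 = errseq_check_and_advance(&mapping->wb_err, &since);
if (!err)
        err = err2;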
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 091ab0eaabbe..2b2938970da3 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -204,8 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i,
if (ret)
goto fail;
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(ri->ctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(ri->ctime))));
jffs2_free_raw_inode(ri);
@@ -238,7 +238,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
if (dead_f->inocache)
set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink);
if (!ret)
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
return ret;
}
/***********************************************************************/
@@ -272,7 +273,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink);
mutex_unlock(&f->sem);
d_instantiate(dentry, d_inode(old_dentry));
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
ihold(d_inode(old_dentry));
}
return ret;
@@ -423,8 +425,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
jffs2_free_raw_dirent(rd);
@@ -568,8 +570,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
inc_nlink(dir_i);
jffs2_free_raw_dirent(rd);
@@ -610,7 +612,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
dentry->d_name.len, f, now);
if (!ret) {
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
clear_nlink(d_inode(dentry));
drop_nlink(dir_i);
}
@@ -746,8 +749,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
jffs2_free_raw_dirent(rd);
@@ -868,16 +871,18 @@ static int jffs2_rename (struct mnt_idmap *idmap,
* caller won't do it on its own since we are returning an error.
*/
d_invalidate(new_dentry);
- new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i,
- ITIME(now));
+ inode_set_mtime_to_ts(new_dir_i,
+ inode_set_ctime_to_ts(new_dir_i, ITIME(now)));
return ret;
}
if (d_is_dir(old_dentry))
drop_nlink(old_dir_i);
- old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now));
- new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now));
+ inode_set_mtime_to_ts(old_dir_i,
+ inode_set_ctime_to_ts(old_dir_i, ITIME(now)));
+ inode_set_mtime_to_ts(new_dir_i,
+ inode_set_ctime_to_ts(new_dir_i, ITIME(now)));
return 0;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 11c66793960e..62ea76da7fdf 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -317,8 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
inode->i_size = pos + writtenlen;
inode->i_blocks = (inode->i_size + 511) >> 9;
- inode->i_mtime = inode_set_ctime_to_ts(inode,
- ITIME(je32_to_cpu(ri->ctime)));
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime))));
}
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 0403efab4089..d175cccb7c55 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -113,8 +113,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size);
- ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime));
- ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime));
+ ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode_get_atime(inode)));
+ ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode_get_mtime(inode)));
ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode)));
ri->offset = cpu_to_je32(0);
@@ -147,9 +147,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
return PTR_ERR(new_metadata);
}
/* It worked. Update the inode */
- inode->i_atime = ITIME(je32_to_cpu(ri->atime));
+ inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(ri->atime)));
inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime)));
- inode->i_mtime = ITIME(je32_to_cpu(ri->mtime));
+ inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(ri->mtime)));
inode->i_mode = jemode_to_cpu(ri->mode);
i_uid_write(inode, je16_to_cpu(ri->uid));
i_gid_write(inode, je16_to_cpu(ri->gid));
@@ -282,8 +282,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
i_uid_write(inode, je16_to_cpu(latest_node.uid));
i_gid_write(inode, je16_to_cpu(latest_node.gid));
inode->i_size = je32_to_cpu(latest_node.isize);
- inode->i_atime = ITIME(je32_to_cpu(latest_node.atime));
- inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
+ inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(latest_node.atime)));
+ inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(latest_node.mtime)));
inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime)));
set_nlink(inode, f->inocache->pino_nlink);
@@ -386,8 +386,8 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
iattr.ia_mode = inode->i_mode;
iattr.ia_uid = inode->i_uid;
iattr.ia_gid = inode->i_gid;
- iattr.ia_atime = inode->i_atime;
- iattr.ia_mtime = inode->i_mtime;
+ iattr.ia_atime = inode_get_atime(inode);
+ iattr.ia_mtime = inode_get_mtime(inode);
iattr.ia_ctime = inode_get_ctime(inode);
jffs2_do_setattr(inode, &iattr);
@@ -475,8 +475,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_mode = jemode_to_cpu(ri->mode);
i_gid_write(inode, je16_to_cpu(ri->gid));
i_uid_write(inode, je16_to_cpu(ri->uid));
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
- ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
+ simple_inode_init_ts(inode);
+ ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode_get_mtime(inode)));
inode->i_blocks = 0;
inode->i_size = 0;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 50727a1ff931..86ab014a349c 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -36,8 +36,8 @@ struct kvec;
#define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds())
#define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec)
#define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f)))
-#define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime)
-#define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime)
+#define JFFS2_F_I_MTIME(f) I_SEC(inode_get_mtime(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_ATIME(f) I_SEC(inode_get_atime(OFNI_EDONI_2SFFJ(f)))
#define sleep_on_spinunlock(wq, s) \
do { \
DECLARE_WAITQUEUE(__wait, current); \
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 7ea37f49f1e1..f99591a634b4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -150,6 +150,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
}
static const struct export_operations jffs2_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = jffs2_get_parent,
.fh_to_dentry = jffs2_fh_to_dentry,
.fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 3b6bdc9a49e1..00224f3a8d6e 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -920,7 +920,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
* do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
* is an implementation of setxattr handler on jffs2.
* -------------------------------------------------- */
-const struct xattr_handler *jffs2_xattr_handlers[] = {
+const struct xattr_handler * const jffs2_xattr_handlers[] = {
&jffs2_user_xattr_handler,
#ifdef CONFIG_JFFS2_FS_SECURITY
&jffs2_security_xattr_handler,
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 1b5030a3349d..7e7de093ec0a 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -94,7 +94,7 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
const char *buffer, size_t size, int flags);
-extern const struct xattr_handler *jffs2_xattr_handlers[];
+extern const struct xattr_handler * const jffs2_xattr_handlers[];
extern const struct xattr_handler jffs2_user_xattr_handler;
extern const struct xattr_handler jffs2_trusted_xattr_handler;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 920d58a1566b..1a6b5921d17a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
break;
}
- ip->i_mtime = inode_set_ctime_current(ip);
+ inode_set_mtime_to_ts(ip, inode_set_ctime_current(ip));
mark_inode_dirty(ip);
txCommit(tid, 1, &ip, 0);
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 6b231d0d0071..603aae17a693 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -96,7 +96,7 @@ struct dinode {
#define di_gengen u._file._u1._imap._gengen
union {
- xtpage_t _xtroot;
+ xtroot_t _xtroot;
struct {
u8 unused[16]; /* 16: */
dxd_t _dxd; /* 16: */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 88afd108c2dd..11c77757ead9 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -87,7 +87,7 @@ static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
static int dbFindBits(u32 word, int l2nb);
static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
-static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
+static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl);
static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
int nblocks);
static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
@@ -180,7 +180,8 @@ int dbMount(struct inode *ipbmap)
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
- if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) {
+ if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE ||
+ bmp->db_l2nbperpage < 0) {
err = -EINVAL;
goto err_release_metapage;
}
@@ -194,6 +195,12 @@ int dbMount(struct inode *ipbmap)
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
+ if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 ||
+ bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
@@ -1710,7 +1717,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
* dbFindLeaf() returns the index of the leaf at which
* free space was found.
*/
- rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
+ rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx, true);
/* release the buffer.
*/
@@ -1957,7 +1964,7 @@ dbAllocDmapLev(struct bmap * bmp,
* free space. if sufficient free space is found, dbFindLeaf()
* returns the index of the leaf at which free space was found.
*/
- if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
+ if (dbFindLeaf((dmtree_t *) &dp->tree, l2nb, &leafidx, false))
return -ENOSPC;
if (leafidx < 0)
@@ -2921,14 +2928,18 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
* leafidx - return pointer to be set to the index of the leaf
* describing at least l2nb free blocks if sufficient
* free blocks are found.
+ * is_ctl - true if the tree is a dmapctl (control page) tree
*
* RETURN VALUES:
* 0 - success
* -ENOSPC - insufficient free blocks.
*/
-static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
+static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl)
{
int ti, n = 0, k, x = 0;
+ int max_size;
+
+ max_size = is_ctl ? CTLTREESIZE : TREESIZE;
/* first check the root of the tree to see if there is
* sufficient free space.
@@ -2949,6 +2960,8 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
/* sufficient free space found. move to the next
* level (or quit if this is the last level).
*/
+ if (x + n > max_size)
+ return -ENOSPC;
if (l2nb <= tp->dmt_stree[x + n])
break;
}
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 923a58422c46..a037ee59e398 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -670,7 +670,7 @@ int diWrite(tid_t tid, struct inode *ip)
* This is the special xtree inside the directory for storing
* the directory table
*/
- xtpage_t *p, *xp;
+ xtroot_t *p, *xp;
xad_t *xad;
jfs_ip->xtlid = 0;
@@ -684,7 +684,7 @@ int diWrite(tid_t tid, struct inode *ip)
* copy xtree root from inode to dinode:
*/
p = &jfs_ip->i_xtroot;
- xp = (xtpage_t *) &dp->di_dirtable;
+ xp = (xtroot_t *) &dp->di_dirtable;
lv = ilinelock->lv;
for (n = 0; n < ilinelock->index; n++, lv++) {
memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
@@ -713,7 +713,7 @@ int diWrite(tid_t tid, struct inode *ip)
* regular file: 16 byte (XAD slot) granularity
*/
if (type & tlckXTREE) {
- xtpage_t *p, *xp;
+ xtroot_t *p, *xp;
xad_t *xad;
/*
@@ -1320,7 +1320,7 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
int diAlloc(struct inode *pip, bool dir, struct inode *ip)
{
int rc, ino, iagno, addext, extno, bitno, sword;
- int nwords, rem, i, agno;
+ int nwords, rem, i, agno, dn_numag;
u32 mask, inosmap, extsmap;
struct inode *ipimap;
struct metapage *mp;
@@ -1356,6 +1356,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
/* get the ag number of this iag */
agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb));
+ dn_numag = JFS_SBI(pip->i_sb)->bmap->db_numag;
+ if (agno < 0 || agno > dn_numag)
+ return -EIO;
if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
/*
@@ -3061,10 +3064,10 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
}
ip->i_size = le64_to_cpu(dip->di_size);
- ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
- ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
- ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
- ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
+ inode_set_atime(ip, le32_to_cpu(dip->di_atime.tv_sec),
+ le32_to_cpu(dip->di_atime.tv_nsec));
+ inode_set_mtime(ip, le32_to_cpu(dip->di_mtime.tv_sec),
+ le32_to_cpu(dip->di_mtime.tv_nsec));
inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec),
le32_to_cpu(dip->di_ctime.tv_nsec));
ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
@@ -3138,12 +3141,12 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
else /* Leave the original permissions alone */
dip->di_mode = cpu_to_le32(jfs_ip->mode2);
- dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
- dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
- dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec);
- dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec);
- dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
- dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
+ dip->di_atime.tv_sec = cpu_to_le32(inode_get_atime_sec(ip));
+ dip->di_atime.tv_nsec = cpu_to_le32(inode_get_atime_nsec(ip));
+ dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime_sec(ip));
+ dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime_nsec(ip));
+ dip->di_mtime.tv_sec = cpu_to_le32(inode_get_mtime_sec(ip));
+ dip->di_mtime.tv_nsec = cpu_to_le32(inode_get_mtime_nsec(ip));
dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */
dip->di_acl = jfs_ip->acl; /* as are dxd's */
dip->di_ea = jfs_ip->ea;
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 721def69e732..dd4264aa9bed 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -66,7 +66,7 @@ struct jfs_inode_info {
lid_t xtlid; /* lid of xtree lock on directory */
union {
struct {
- xtpage_t _xtroot; /* 288: xtree root */
+ xtroot_t _xtroot; /* 288: xtree root */
struct inomap *_imap; /* 4: inode map header */
} file;
struct {
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 87594efa7f7c..f10f295d1502 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_inode->mode2 |= inode->i_mode;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- jfs_inode->otime = inode_get_ctime(inode).tv_sec;
+ simple_inode_init_ts(inode);
+ jfs_inode->otime = inode_get_ctime_sec(inode);
inode->i_generation = JFS_SBI(sb)->gengen++;
jfs_inode->cflag = 0;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e855b8fde76c..cb6d1fda66a7 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
int lmLogOpen(struct super_block *sb)
{
int rc;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct jfs_log *log;
struct jfs_sb_info *sbi = JFS_SBI(sb);
@@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb)
mutex_lock(&jfs_log_mutex);
list_for_each_entry(log, &jfs_external_logs, journal_list) {
- if (log->bdev->bd_dev == sbi->logdev) {
+ if (log->bdev_handle->bdev->bd_dev == sbi->logdev) {
if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
jfs_warn("wrong uuid on JFS journal");
mutex_unlock(&jfs_log_mutex);
@@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb)
* file systems to log may have n-to-1 relationship;
*/
- bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
- log, NULL);
- if (IS_ERR(bdev)) {
- rc = PTR_ERR(bdev);
+ bdev_handle = bdev_open_by_dev(sbi->logdev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL);
+ if (IS_ERR(bdev_handle)) {
+ rc = PTR_ERR(bdev_handle);
goto free;
}
- log->bdev = bdev;
+ log->bdev_handle = bdev_handle;
uuid_copy(&log->uuid, &sbi->loguuid);
/*
@@ -1141,7 +1141,7 @@ journal_found:
lbmLogShutdown(log);
close: /* close external log device */
- blkdev_put(bdev, log);
+ bdev_release(bdev_handle);
free: /* free log descriptor */
mutex_unlock(&jfs_log_mutex);
@@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb)
init_waitqueue_head(&log->syncwait);
set_bit(log_INLINELOG, &log->flag);
- log->bdev = sb->s_bdev;
+ log->bdev_handle = sb->s_bdev_handle;
log->base = addressPXD(&JFS_SBI(sb)->logpxd);
log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
(L2LOGPSIZE - sb->s_blocksize_bits);
@@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
struct jfs_log *log = sbi->log;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int rc = 0;
jfs_info("lmLogClose: log:0x%p", log);
@@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb)
* external log as separate logical volume
*/
list_del(&log->journal_list);
- bdev = log->bdev;
+ bdev_handle = log->bdev_handle;
rc = lmLogShutdown(log);
- blkdev_put(bdev, log);
+ bdev_release(bdev_handle);
kfree(log);
@@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS);
+ bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
@@ -2110,10 +2110,15 @@ static void lbmStartIO(struct lbuf * bp)
{
struct bio *bio;
struct jfs_log *log = bp->l_log;
+ struct block_device *bdev = NULL;
jfs_info("lbmStartIO");
- bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
+ if (!log->no_integrity)
+ bdev = log->bdev_handle->bdev;
+
+ bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC,
+ GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 805877ce5020..84aa2d253907 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -356,7 +356,7 @@ struct jfs_log {
* before writing syncpt.
*/
struct list_head journal_list; /* Global list */
- struct block_device *bdev; /* 4: log lv pointer */
+ struct bdev_handle *bdev_handle; /* 4: log lv pointer */
int serial; /* 4: log mount serial number */
s64 base; /* @8: log extent address (inline log ) */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index b83aae56a1f2..415eb65a36ff 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -430,7 +430,8 @@ int updateSuper(struct super_block *sb, uint state)
if (state == FM_MOUNT) {
/* record log's dev_t and mount serial number */
- j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev));
+ j_sb->s_logdev = cpu_to_le32(
+ new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev));
j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
} else if (state == FM_CLEAN) {
/*
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index ce4b4760fcb1..dccc8b3f1045 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -783,7 +783,7 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
if (mp->xflag & COMMIT_PAGE)
p = (xtpage_t *) mp->data;
else
- p = &jfs_ip->i_xtroot;
+ p = (xtpage_t *) &jfs_ip->i_xtroot;
xtlck->lwm.offset =
le16_to_cpu(p->header.nextindex);
}
@@ -1676,7 +1676,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
if (tlck->type & tlckBTROOT) {
lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
- p = &JFS_IP(ip)->i_xtroot;
+ p = (xtpage_t *) &JFS_IP(ip)->i_xtroot;
if (S_ISDIR(ip->i_mode))
lrd->log.redopage.type |=
cpu_to_le16(LOG_DIR_XTREE);
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 0d33816d251d..ec67d8554d2c 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -46,7 +46,7 @@ extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *,
extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
-extern const struct xattr_handler *jfs_xattr_handlers[];
+extern const struct xattr_handler * const jfs_xattr_handlers[];
#ifdef CONFIG_JFS_SECURITY
extern int jfs_init_security(tid_t, struct inode *, struct inode *,
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 2d304cee884c..5ee618d17e77 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -1213,7 +1213,7 @@ xtSplitRoot(tid_t tid,
struct xtlock *xtlck;
int rc;
- sp = &JFS_IP(ip)->i_xtroot;
+ sp = (xtpage_t *) &JFS_IP(ip)->i_xtroot;
INCREMENT(xtStat.split);
@@ -2098,7 +2098,7 @@ int xtAppend(tid_t tid, /* transaction id */
*/
void xtInitRoot(tid_t tid, struct inode *ip)
{
- xtpage_t *p;
+ xtroot_t *p;
/*
* acquire a transaction lock on the root
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index ad7592191d76..0f6cf5a1ce75 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -65,24 +65,33 @@ struct xadlist {
#define XTPAGEMAXSLOT 256
#define XTENTRYSTART 2
-/*
- * xtree page:
- */
-typedef union {
- struct xtheader {
- __le64 next; /* 8: */
- __le64 prev; /* 8: */
+struct xtheader {
+ __le64 next; /* 8: */
+ __le64 prev; /* 8: */
- u8 flag; /* 1: */
- u8 rsrvd1; /* 1: */
- __le16 nextindex; /* 2: next index = number of entries */
- __le16 maxentry; /* 2: max number of entries */
- __le16 rsrvd2; /* 2: */
+ u8 flag; /* 1: */
+ u8 rsrvd1; /* 1: */
+ __le16 nextindex; /* 2: next index = number of entries */
+ __le16 maxentry; /* 2: max number of entries */
+ __le16 rsrvd2; /* 2: */
- pxd_t self; /* 8: self */
- } header; /* (32) */
+ pxd_t self; /* 8: self */
+};
+/*
+ * xtree root (in inode):
+ */
+typedef union {
+ struct xtheader header;
xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */
+} xtroot_t;
+
+/*
+ * xtree page:
+ */
+typedef union {
+ struct xtheader header;
+ xad_t xad[XTPAGEMAXSLOT]; /* 16 * maxentry: xad array */
} xtpage_t;
/*
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 57d7a4300210..d68a4e6ac345 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
@@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
/* update parent directory inode */
inc_nlink(dip); /* for '..' from child directory */
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
rc = txCommit(tid, 2, &iplist[0], 0);
@@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
/* update parent directory's link count corresponding
* to ".." entry of the target directory deleted
*/
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
inode_dec_link_count(dip);
/*
@@ -512,7 +512,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
ASSERT(ip->i_nlink);
- dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip));
+ inode_set_mtime_to_ts(dip,
+ inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip)));
mark_inode_dirty(dip);
/* update target's inode */
@@ -828,7 +829,7 @@ static int jfs_link(struct dentry *old_dentry,
/* update object inode */
inc_nlink(ip); /* for new link */
inode_set_ctime_current(ip);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ihold(ip);
@@ -1028,7 +1029,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
/*
* commit update of parent directory and link object
@@ -1271,7 +1272,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inode_set_ctime_current(old_ip);
mark_inode_dirty(old_ip);
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
mark_inode_dirty(new_dir);
/* Build list of inodes modified by this transaction */
@@ -1283,7 +1284,8 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (old_dir != new_dir) {
iplist[ipcount++] = new_dir;
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir,
+ inode_set_ctime_current(old_dir));
mark_inode_dirty(old_dir);
}
@@ -1416,7 +1418,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mark_inode_dirty(ip);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2e2f7f6d36a0..8d8e556bd610 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -818,7 +818,7 @@ out:
}
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
inode_unlock(inode);
return len - towrite;
@@ -896,6 +896,7 @@ static const struct super_operations jfs_super_operations = {
};
static const struct export_operations jfs_export_operations = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = jfs_fh_to_dentry,
.fh_to_parent = jfs_fh_to_parent,
.get_parent = jfs_get_parent,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 8577ad494e05..0fb7afac298e 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -985,7 +985,7 @@ static const struct xattr_handler jfs_trusted_xattr_handler = {
.set = jfs_xattr_set,
};
-const struct xattr_handler *jfs_xattr_handlers[] = {
+const struct xattr_handler * const jfs_xattr_handlers[] = {
&jfs_os2_xattr_handler,
&jfs_user_xattr_handler,
&jfs_security_xattr_handler,
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 180906c36f51..f0cb729e9a97 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -429,60 +429,11 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
-#ifdef CONFIG_NUMA
-static int kernfs_vma_set_policy(struct vm_area_struct *vma,
- struct mempolicy *new)
-{
- struct file *file = vma->vm_file;
- struct kernfs_open_file *of = kernfs_of(file);
- int ret;
-
- if (!of->vm_ops)
- return 0;
-
- if (!kernfs_get_active(of->kn))
- return -EINVAL;
-
- ret = 0;
- if (of->vm_ops->set_policy)
- ret = of->vm_ops->set_policy(vma, new);
-
- kernfs_put_active(of->kn);
- return ret;
-}
-
-static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
-{
- struct file *file = vma->vm_file;
- struct kernfs_open_file *of = kernfs_of(file);
- struct mempolicy *pol;
-
- if (!of->vm_ops)
- return vma->vm_policy;
-
- if (!kernfs_get_active(of->kn))
- return vma->vm_policy;
-
- pol = vma->vm_policy;
- if (of->vm_ops->get_policy)
- pol = of->vm_ops->get_policy(vma, addr);
-
- kernfs_put_active(of->kn);
- return pol;
-}
-
-#endif
-
static const struct vm_operations_struct kernfs_vm_ops = {
.open = kernfs_vma_open,
.fault = kernfs_vma_fault,
.page_mkwrite = kernfs_vma_page_mkwrite,
.access = kernfs_vma_access,
-#ifdef CONFIG_NUMA
- .set_policy = kernfs_vma_set_policy,
- .get_policy = kernfs_vma_get_policy,
-#endif
};
static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
@@ -903,6 +854,33 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
return ret;
}
+static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct kernfs_open_file *of = kernfs_of(file);
+ const struct kernfs_ops *ops;
+ loff_t ret;
+
+ /*
+ * @of->mutex nests outside active ref and is primarily to ensure that
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+ if (!kernfs_get_active(of->kn)) {
+ mutex_unlock(&of->mutex);
+ return -ENODEV;
+ }
+
+ ops = kernfs_ops(of->kn);
+ if (ops->llseek)
+ ret = ops->llseek(of, offset, whence);
+ else
+ ret = generic_file_llseek(file, offset, whence);
+
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+ return ret;
+}
+
static void kernfs_notify_workfn(struct work_struct *work)
{
struct kernfs_node *kn;
@@ -1005,7 +983,7 @@ EXPORT_SYMBOL_GPL(kernfs_notify);
const struct file_operations kernfs_file_fops = {
.read_iter = kernfs_fop_read_iter,
.write_iter = kernfs_fop_write_iter,
- .llseek = generic_file_llseek,
+ .llseek = kernfs_fop_llseek,
.mmap = kernfs_fop_mmap,
.open = kernfs_fop_open,
.release = kernfs_fop_release,
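kernfs_fop_llseek() gives individual attributes a chance to implement their own seek semantics, while everything else keeps the old generic_file_llseek() behaviour. A hypothetical kernfs_ops wiring an llseek handler; the handler's signature is taken from the ops->llseek(of, offset, whence) call above, and the names and policy are illustrative only:

static int example_seq_show(struct seq_file *sf, void *v)
{
        seq_puts(sf, "example\n");
        return 0;
}

static loff_t example_llseek(struct kernfs_open_file *of, loff_t offset,
                             int whence)
{
        /* example policy: absolute seeks only, within the first page */
        if (whence != SEEK_SET || offset < 0 || offset > PAGE_SIZE)
                return -EINVAL;
        return vfs_setpos(of->file, offset, PAGE_SIZE);
}

static const struct kernfs_ops example_ops = {
        .seq_show = example_seq_show,
        .llseek   = example_llseek,
};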
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 922719a343a7..b83054da68b3 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -151,7 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
static inline void set_inode_attr(struct inode *inode,
@@ -159,8 +159,8 @@ static inline void set_inode_attr(struct inode *inode,
{
inode->i_uid = attrs->ia_uid;
inode->i_gid = attrs->ia_gid;
- inode->i_atime = attrs->ia_atime;
- inode->i_mtime = attrs->ia_mtime;
+ inode_set_atime_to_ts(inode, attrs->ia_atime);
+ inode_set_mtime_to_ts(inode, attrs->ia_mtime);
inode_set_ctime_to_ts(inode, attrs->ia_ctime);
}
@@ -445,7 +445,7 @@ static const struct xattr_handler kernfs_user_xattr_handler = {
.set = kernfs_vfs_user_xattr_set,
};
-const struct xattr_handler *kernfs_xattr_handlers[] = {
+const struct xattr_handler * const kernfs_xattr_handlers[] = {
&kernfs_trusted_xattr_handler,
&kernfs_security_xattr_handler,
&kernfs_user_xattr_handler,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index a9b854cdfdb5..237f2764b941 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -127,7 +127,7 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
*/
-extern const struct xattr_handler *kernfs_xattr_handlers[];
+extern const struct xattr_handler * const kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index c4bf26142eec..4628edde2e7e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -21,8 +21,9 @@
#include "kernfs-internal.h"
-struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
-struct kernfs_global_locks *kernfs_locks;
+struct kmem_cache *kernfs_node_cache __ro_after_init;
+struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
+struct kernfs_global_locks *kernfs_locks __ro_after_init;
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
@@ -265,7 +266,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
sb->s_time_gran = 1;
/* sysfs dentries and inodes don't require IO to create */
- sb->s_shrink.seeks = 0;
+ sb->s_shrink->seeks = 0;
/* get root inode, initialize and unlock it */
down_read(&kf_root->kernfs_rwsem);
diff --git a/fs/libfs.c b/fs/libfs.c
index 37f2d34ee090..c2aa6fd4795c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -41,6 +41,9 @@ EXPORT_SYMBOL(simple_getattr);
int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
+
+ buf->f_fsid = u64_to_fsid(id);
buf->f_type = dentry->d_sb->s_magic;
buf->f_bsize = PAGE_SIZE;
buf->f_namelen = NAME_MAX;
@@ -396,6 +399,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
+ /* In this case, ->private_data is protected by f_pos_lock */
+ file->private_data = NULL;
return vfs_setpos(file, offset, U32_MAX);
}
@@ -425,7 +430,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
inode->i_ino, fs_umode_to_dtype(inode->i_mode));
}
-static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
+static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
{
struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
XA_STATE(xas, &so_ctx->xa, ctx->pos);
@@ -434,7 +439,7 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
while (true) {
dentry = offset_find_next(&xas);
if (!dentry)
- break;
+ return ERR_PTR(-ENOENT);
if (!offset_dir_emit(ctx, dentry)) {
dput(dentry);
@@ -444,6 +449,7 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
dput(dentry);
ctx->pos = xas.xa_index + 1;
}
+ return NULL;
}
/**
@@ -476,7 +482,12 @@ static int offset_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- offset_iterate_dir(d_inode(dir), ctx);
+ /* In this case, ->private_data is protected by f_pos_lock */
+ if (ctx->pos == 2)
+ file->private_data = NULL;
+ else if (file->private_data == ERR_PTR(-ENOENT))
+ return 0;
+ file->private_data = offset_iterate_dir(d_inode(dir), ctx);
return 0;
}
@@ -541,7 +552,8 @@ void simple_recursive_removal(struct dentry *dentry,
dput(victim); // unpin it
}
if (victim == dentry) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
if (d_is_dir(dentry))
drop_nlink(inode);
inode_unlock(inode);
@@ -582,7 +594,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_atime = root->i_mtime = inode_set_ctime_current(root);
+ simple_inode_init_ts(root);
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
@@ -638,8 +650,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
{
struct inode *inode = d_inode(old_dentry);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inc_nlink(inode);
ihold(inode);
dget(dentry);
@@ -673,8 +685,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
drop_nlink(inode);
dput(dentry);
return 0;
@@ -709,9 +721,10 @@ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
{
struct inode *newino = d_inode(new_dentry);
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
if (new_dir != old_dir)
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(new_dir,
+ inode_set_ctime_current(new_dir));
inode_set_ctime_current(d_inode(old_dentry));
if (newino)
inode_set_ctime_current(newino);
@@ -926,7 +939,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
@@ -952,7 +965,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
goto out;
}
inode->i_mode = S_IFREG | files->mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_fop = files->ops;
inode->i_ino = i;
d_add(dentry, inode);
@@ -1308,6 +1321,47 @@ ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
EXPORT_SYMBOL_GPL(simple_attr_write_signed);
/**
+ * generic_encode_ino32_fh - generic export_operations->encode_fh function
+ * @inode: the object to encode
+ * @fh: where to store the file handle fragment
+ * @max_len: maximum length to store there (in 4 byte units)
+ * @parent: parent directory inode, if wanted
+ *
+ * This generic encode_fh function assumes that the 32-bit inode number
+ * is suitable for locating an inode, and that the generation number
+ * can be used to check that it is still valid. It places them in the
+ * filehandle fragment where export_decode_fh expects to find them.
+ */
+int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct fid *fid = (void *)fh;
+ int len = *max_len;
+ int type = FILEID_INO32_GEN;
+
+ if (parent && (len < 4)) {
+ *max_len = 4;
+ return FILEID_INVALID;
+ } else if (len < 2) {
+ *max_len = 2;
+ return FILEID_INVALID;
+ }
+
+ len = 2;
+ fid->i32.ino = inode->i_ino;
+ fid->i32.gen = inode->i_generation;
+ if (parent) {
+ fid->i32.parent_ino = parent->i_ino;
+ fid->i32.parent_gen = parent->i_generation;
+ len = 4;
+ type = FILEID_INO32_GEN_PARENT;
+ }
+ *max_len = len;
+ return type;
+}
+EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);
+
+/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
* @fid: file handle to convert
@@ -1520,7 +1574,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
@@ -1912,3 +1966,20 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
return direct_written + buffered_written;
}
EXPORT_SYMBOL_GPL(direct_write_fallback);
+
+/**
+ * simple_inode_init_ts - initialize the timestamps for a new inode
+ * @inode: inode to be initialized
+ *
+ * When a new inode is created, most filesystems set the timestamps to the
+ * current time. This helper sets atime, mtime and ctime to the same
+ * current-time value and returns that value.
+ */
+struct timespec64 simple_inode_init_ts(struct inode *inode)
+{
+ struct timespec64 ts = inode_set_ctime_current(inode);
+
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
+ return ts;
+}
+EXPORT_SYMBOL(simple_inode_init_ts);
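generic_encode_ino32_fh() is the helper that the jffs2 and jfs export_operations hunks above plug in as .encode_fh; any filesystem whose inodes are fully identified by a 32-bit inode number plus generation can reuse it. A minimal wiring sketch (the examplefs_* names are placeholders for the filesystem's own decoders):

static const struct export_operations examplefs_export_ops = {
        .encode_fh    = generic_encode_ino32_fh,
        .fh_to_dentry = examplefs_fh_to_dentry,  /* placeholder decoder */
        .fh_to_parent = examplefs_fh_to_parent,  /* placeholder decoder */
        .get_parent   = examplefs_get_parent,    /* placeholder */
};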
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 6579948070a4..81be07c1d3d1 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -24,7 +24,6 @@
#include <linux/uio.h>
#include <linux/smp.h>
#include <linux/mutex.h>
-#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/inetdevice.h>
@@ -135,11 +134,11 @@ lockd(void *vrqstp)
* The main request loop. We don't terminate until the last
* NFS mount or NFS daemon has gone away.
*/
- while (!kthread_should_stop()) {
+ while (!svc_thread_should_stop(rqstp)) {
/* update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nlm_max_connections;
- nlmsvc_retry_blocked();
+ nlmsvc_retry_blocked(rqstp);
svc_recv(rqstp);
}
if (nlmsvc_ops)
@@ -373,7 +372,9 @@ static void lockd_put(void)
unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
#endif
+ svc_get(nlmsvc_serv);
svc_set_num_threads(nlmsvc_serv, NULL, 0);
+ svc_put(nlmsvc_serv);
timer_delete_sync(&nlmsvc_retry);
nlmsvc_serv = NULL;
dprintk("lockd_down: service destroyed\n");
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 43aeba9de55c..2dc10900ad1c 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
-#include <linux/kthread.h>
#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -481,9 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
{
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
struct inode *inode = nlmsvc_file_inode(file);
-#endif
struct nlm_block *block = NULL;
int error;
int mode;
@@ -497,7 +494,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- if (nlmsvc_file_file(file)->f_op->lock) {
+ if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
async_block = wait;
wait = 0;
}
@@ -543,6 +540,25 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ spin_lock(&nlm_blocked_lock);
+ /*
+ * If this is a lock request for an already pending
+ * lock request we return nlm_lck_blocked without calling
+ * vfs_lock_file() again. Otherwise we would have two pending
+ * requests on the underlying ->lock() implementation but
+ * only one nlm_block to be granted by lm_grant().
+ */
+ if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) &&
+ !list_empty(&block->b_list)) {
+ spin_unlock(&nlm_blocked_lock);
+ ret = nlm_lck_blocked;
+ goto out;
+ }
+
+ /* Append to list of blocked */
+ nlmsvc_insert_block_locked(block, NLM_NEVER);
+ spin_unlock(&nlm_blocked_lock);
+
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
mode = lock_to_openmode(&lock->fl);
@@ -552,16 +568,12 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
dprintk("lockd: vfs_lock_file returned %d\n", error);
switch (error) {
case 0:
+ nlmsvc_remove_block(block);
ret = nlm_granted;
goto out;
case -EAGAIN:
- /*
- * If this is a blocking request for an
- * already pending lock request then we need
- * to put it back on lockd's block list
- */
- if (wait)
- break;
+ if (!wait)
+ nlmsvc_remove_block(block);
ret = async_block ? nlm_lck_blocked : nlm_lck_denied;
goto out;
case FILE_LOCK_DEFERRED:
@@ -572,17 +584,16 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
ret = nlmsvc_defer_lock_rqst(rqstp, block);
goto out;
case -EDEADLK:
+ nlmsvc_remove_block(block);
ret = nlm_deadlock;
goto out;
default: /* includes ENOLCK */
+ nlmsvc_remove_block(block);
ret = nlm_lck_denied_nolocks;
goto out;
}
ret = nlm_lck_blocked;
-
- /* Append to list of blocked */
- nlmsvc_insert_block(block, NLM_NEVER);
out:
mutex_unlock(&file->f_mutex);
nlmsvc_release_block(block);
@@ -1020,13 +1031,13 @@ retry_deferred_block(struct nlm_block *block)
* be retransmitted.
*/
void
-nlmsvc_retry_blocked(void)
+nlmsvc_retry_blocked(struct svc_rqst *rqstp)
{
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
struct nlm_block *block;
spin_lock(&nlm_blocked_lock);
- while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
+ while (!list_empty(&nlm_blocked) && !svc_thread_should_stop(rqstp)) {
block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
if (block->b_when == NLM_NEVER)
diff --git a/fs/locks.c b/fs/locks.c
index 76ad05f8070a..46d88b9e222c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -167,8 +167,8 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
*/
static DEFINE_SPINLOCK(blocked_lock_lock);
-static struct kmem_cache *flctx_cache __read_mostly;
-static struct kmem_cache *filelock_cache __read_mostly;
+static struct kmem_cache *flctx_cache __ro_after_init;
+static struct kmem_cache *filelock_cache __ro_after_init;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
@@ -2264,11 +2264,13 @@ out:
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Callers expecting ->lock() to return asynchronously
- * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
- * the request is for a blocking lock. When ->lock() does return asynchronously,
- * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
- * request completes.
+ * lm_grant is set. Additionally, the EXPORT_OP_ASYNC_LOCK flag must be set
+ * in the filesystem's export_operations.
+ *
+ * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
+ * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
+ * blocking lock. When ->lock() does return asynchronously, it must return
+ * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
* If the request is for non-blocking lock the file system should return
* FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
* with the result. If the request timed out the callback routine will return a
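As the rewritten comment states, asynchronous ->lock() now requires an explicit opt-in from the exporting filesystem. A hedged sketch of what that looks like on the filesystem side (all example_* names are hypothetical):

static const struct export_operations example_export_ops = {
	.fh_to_dentry	= example_fh_to_dentry,
	.fh_to_parent	= example_fh_to_parent,
	/* advertise that ->lock() may return FILE_LOCK_DEFERRED and call lm_grant() */
	.flags		= EXPORT_OP_ASYNC_LOCK,
};

Without the flag, lockd treats the lock as synchronous even when the file operations provide ->lock().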
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2a4b8b549e93..82aa7a35db26 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -37,7 +37,7 @@ struct mb_cache {
struct list_head c_list;
/* Number of entries in cache */
unsigned long c_entry_count;
- struct shrinker c_shrink;
+ struct shrinker *c_shrink;
/* Work for shrinking when the cache has too many entries */
struct work_struct c_shrink_work;
};
@@ -293,8 +293,7 @@ EXPORT_SYMBOL(mb_cache_entry_touch);
static unsigned long mb_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct mb_cache *cache = container_of(shrink, struct mb_cache,
- c_shrink);
+ struct mb_cache *cache = shrink->private_data;
return cache->c_entry_count;
}
@@ -333,8 +332,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
static unsigned long mb_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct mb_cache *cache = container_of(shrink, struct mb_cache,
- c_shrink);
+ struct mb_cache *cache = shrink->private_data;
return mb_cache_shrink(cache, sc->nr_to_scan);
}
@@ -377,15 +375,19 @@ struct mb_cache *mb_cache_create(int bucket_bits)
for (i = 0; i < bucket_count; i++)
INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
- cache->c_shrink.count_objects = mb_cache_count;
- cache->c_shrink.scan_objects = mb_cache_scan;
- cache->c_shrink.seeks = DEFAULT_SEEKS;
- if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
+ cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker");
+ if (!cache->c_shrink) {
kfree(cache->c_hash);
kfree(cache);
goto err_out;
}
+ cache->c_shrink->count_objects = mb_cache_count;
+ cache->c_shrink->scan_objects = mb_cache_scan;
+ cache->c_shrink->private_data = cache;
+
+ shrinker_register(cache->c_shrink);
+
INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
return cache;
@@ -406,7 +408,7 @@ void mb_cache_destroy(struct mb_cache *cache)
{
struct mb_cache_entry *entry, *next;
- unregister_shrinker(&cache->c_shrink);
+ shrinker_free(cache->c_shrink);
/*
* We don't bother with any locking. Cache must not be used at this
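The mbcache conversion above is one instance of the dynamically allocated shrinker API used throughout this series: shrinker_alloc() replaces an embedded struct shrinker, private_data replaces container_of(), and shrinker_free() covers both unregister and free. A minimal sketch of the pattern with hypothetical names:

struct my_cache {
	struct shrinker *shrink;
	/* ... cache state ... */
};

static int my_cache_shrinker_setup(struct my_cache *cache)
{
	cache->shrink = shrinker_alloc(0, "my-cache");
	if (!cache->shrink)
		return -ENOMEM;

	cache->shrink->count_objects = my_cache_count;	/* hypothetical callbacks */
	cache->shrink->scan_objects  = my_cache_scan;
	cache->shrink->private_data  = cache;		/* fetched back via shrink->private_data */

	shrinker_register(cache->shrink);		/* shrinker becomes visible to reclaim */
	return 0;
}

/* Teardown: shrinker_free(cache->shrink) unregisters and frees in one call. */

The nfs42xattr, nfs/super.c and nfsd/filecache hunks later in this diff follow the same alloc/register/free shape.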
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 25c08fbfcb9d..7da66ca184f4 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
}
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = j;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
insert_inode_hash(inode);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 20f23e6e58ad..62c313fc9a49 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -281,7 +281,7 @@ got_it:
de->inode = inode->i_ino;
}
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = minix_handle_dirsync(dir);
out_put:
@@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
else
de->inode = 0;
dir_commit_chunk(page, pos, len);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return minix_handle_dirsync(inode);
}
@@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
else
de->inode = inode->i_ino;
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
return minix_handle_dirsync(dir);
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df575473c1cc..f8af6c3ae336 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -501,7 +501,8 @@ static struct inode *V1_minix_iget(struct inode *inode)
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
- inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, raw_inode->i_time, 0)));
inode->i_blocks = 0;
for (i = 0; i < 9; i++)
minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
@@ -538,11 +539,9 @@ static struct inode *V2_minix_iget(struct inode *inode)
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
- inode->i_mtime.tv_sec = raw_inode->i_mtime;
- inode->i_atime.tv_sec = raw_inode->i_atime;
+ inode_set_mtime(inode, raw_inode->i_mtime, 0);
+ inode_set_atime(inode, raw_inode->i_atime, 0);
inode_set_ctime(inode, raw_inode->i_ctime, 0);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
inode->i_blocks = 0;
for (i = 0; i < 10; i++)
minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
@@ -589,7 +588,7 @@ static struct buffer_head * V1_minix_update_inode(struct inode * inode)
raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
raw_inode->i_nlinks = inode->i_nlink;
raw_inode->i_size = inode->i_size;
- raw_inode->i_time = inode->i_mtime.tv_sec;
+ raw_inode->i_time = inode_get_mtime_sec(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
else for (i = 0; i < 9; i++)
@@ -616,9 +615,9 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
raw_inode->i_nlinks = inode->i_nlink;
raw_inode->i_size = inode->i_size;
- raw_inode->i_mtime = inode->i_mtime.tv_sec;
- raw_inode->i_atime = inode->i_atime.tv_sec;
- raw_inode->i_ctime = inode_get_ctime(inode).tv_sec;
+ raw_inode->i_mtime = inode_get_mtime_sec(inode);
+ raw_inode->i_atime = inode_get_atime_sec(inode);
+ raw_inode->i_ctime = inode_get_ctime_sec(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
else for (i = 0; i < 10; i++)
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index ce18ae37c29d..dad131e30c05 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -350,7 +350,7 @@ do_indirects:
}
first_whole++;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
}
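The minix hunks above are part of the tree-wide move to timestamp accessor helpers instead of touching i_mtime/i_atime directly. A short sketch of the idioms they rely on (raw_secs is a placeholder for an on-disk field):

static void example_stamp_inode(struct inode *inode, time64_t raw_secs)
{
	/* loading from disk: seconds only, nanoseconds forced to zero */
	inode_set_mtime(inode, raw_secs, 0);
	inode_set_atime(inode, raw_secs, 0);
	inode_set_ctime(inode, raw_secs, 0);

	/* directory updates: mtime follows the freshly set ctime, as in dir.c */
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));

	/* writing back: read just the seconds part */
	pr_debug("mtime=%lld\n", (long long)inode_get_mtime_sec(inode));
}

Brand-new inodes use simple_inode_init_ts(inode), which sets atime, mtime and ctime to the current time in one call, as in minix_new_inode() above.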
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 4905665c47d0..57d1dedf3f8f 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -256,6 +256,7 @@ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
return idmap;
}
+EXPORT_SYMBOL_GPL(mnt_idmap_get);
/**
* mnt_idmap_put - put a reference to an idmapping
@@ -271,3 +272,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap)
kfree(idmap);
}
}
+EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/fs/mpage.c b/fs/mpage.c
index 242e213ee064..ffb064ed9d04 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -119,8 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
folio_mark_uptodate(folio);
return;
}
- create_empty_buffers(&folio->page, i_blocksize(inode), 0);
- head = folio_buffers(folio);
+ head = create_empty_buffers(folio, i_blocksize(inode), 0);
}
page_bh = head;
diff --git a/fs/namei.c b/fs/namei.c
index 94565bd7e73f..71c13b2990b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3105,25 +3105,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
EXPORT_SYMBOL(unlock_rename);
/**
- * mode_strip_umask - handle vfs umask stripping
- * @dir: parent directory of the new inode
- * @mode: mode of the new inode to be created in @dir
- *
- * Umask stripping depends on whether or not the filesystem supports POSIX
- * ACLs. If the filesystem doesn't support it umask stripping is done directly
- * in here. If the filesystem does support POSIX ACLs umask stripping is
- * deferred until the filesystem calls posix_acl_create().
- *
- * Returns: mode
- */
-static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
-{
- if (!IS_POSIXACL(dir))
- mode &= ~current_umask();
- return mode;
-}
-
-/**
* vfs_prepare_mode - prepare the mode to be used for a new inode
* @idmap: idmap of the mount the inode was found from
* @dir: parent directory of the new inode
@@ -3536,7 +3517,8 @@ static const char *open_last_lookups(struct nameidata *nd,
if (likely(dentry))
goto finish_lookup;
- BUG_ON(nd->flags & LOOKUP_RCU);
+ if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
+ return ERR_PTR(-ECHILD);
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) {
@@ -3803,7 +3785,10 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- fput(file);
+ if (unlikely(file->f_mode & FMODE_OPENED))
+ fput(file);
+ else
+ release_empty_file(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -4387,11 +4372,9 @@ retry_deleg:
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
- if (last.name[last.len])
+ if (last.name[last.len] || d_is_negative(dentry))
goto slashes;
inode = dentry->d_inode;
- if (d_is_negative(dentry))
- goto slashes;
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..fbf0e596fcd3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -39,10 +39,10 @@
/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;
-static unsigned int m_hash_mask __read_mostly;
-static unsigned int m_hash_shift __read_mostly;
-static unsigned int mp_hash_mask __read_mostly;
-static unsigned int mp_hash_shift __read_mostly;
+static unsigned int m_hash_mask __ro_after_init;
+static unsigned int m_hash_shift __ro_after_init;
+static unsigned int mp_hash_mask __ro_after_init;
+static unsigned int mp_hash_shift __ro_after_init;
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
@@ -68,9 +68,9 @@ static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
-static struct hlist_head *mount_hashtable __read_mostly;
-static struct hlist_head *mountpoint_hashtable __read_mostly;
-static struct kmem_cache *mnt_cache __read_mostly;
+static struct hlist_head *mount_hashtable __ro_after_init;
+static struct hlist_head *mountpoint_hashtable __ro_after_init;
+static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
@@ -86,7 +86,7 @@ struct mount_kattr {
};
/* /sys/fs */
-struct kobject *fs_kobj;
+struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);
/*
@@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
* can determine when writes are able to occur to a filesystem.
*/
/**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
* returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
* called. This is effectively a refcount.
*/
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int ret = 0;
@@ -386,6 +386,7 @@ int __mnt_want_write(struct vfsmount *m)
return ret;
}
+EXPORT_SYMBOL_GPL(mnt_get_write_access);
/**
* mnt_want_write - get write access to a mount
@@ -401,7 +402,7 @@ int mnt_want_write(struct vfsmount *m)
int ret;
sb_start_write(m->mnt_sb);
- ret = __mnt_want_write(m);
+ ret = mnt_get_write_access(m);
if (ret)
sb_end_write(m->mnt_sb);
return ret;
@@ -409,15 +410,15 @@ int mnt_want_write(struct vfsmount *m)
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file on whose mount to take a write
*
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
* skips incrementing mnt_writers (since the open file already has a reference)
* and instead only does the check for emergency r/o remounts. This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
*/
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
{
if (file->f_mode & FMODE_WRITER) {
/*
@@ -428,7 +429,7 @@ int __mnt_want_write_file(struct file *file)
return -EROFS;
return 0;
}
- return __mnt_want_write(file->f_path.mnt);
+ return mnt_get_write_access(file->f_path.mnt);
}
/**
@@ -445,7 +446,7 @@ int mnt_want_write_file(struct file *file)
int ret;
sb_start_write(file_inode(file)->i_sb);
- ret = __mnt_want_write_file(file);
+ ret = mnt_get_write_access_file(file);
if (ret)
sb_end_write(file_inode(file)->i_sb);
return ret;
@@ -453,19 +454,20 @@ int mnt_want_write_file(struct file *file)
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
*/
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
{
preempt_disable();
mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
+EXPORT_SYMBOL_GPL(mnt_put_write_access);
/**
* mnt_drop_write - give up write access to a mount
@@ -477,20 +479,20 @@ void __mnt_drop_write(struct vfsmount *mnt)
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
{
if (!(file->f_mode & FMODE_WRITER))
- __mnt_drop_write(file->f_path.mnt);
+ mnt_put_write_access(file->f_path.mnt);
}
void mnt_drop_write_file(struct file *file)
{
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);
@@ -1344,9 +1346,9 @@ void mntput(struct vfsmount *mnt)
{
if (mnt) {
struct mount *m = real_mount(mnt);
- /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+ /* avoid cacheline pingpong */
if (unlikely(m->mnt_expiry_mark))
- m->mnt_expiry_mark = 0;
+ WRITE_ONCE(m->mnt_expiry_mark, 0);
mntput_no_expire(m);
}
}
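The __mnt_want_write()/__mnt_drop_write() renames above keep the old contract: the mnt_get_write_access()/mnt_put_write_access() pair is freeze-unaware and must always be balanced, while mnt_want_write()/mnt_drop_write() additionally wrap sb_start_write()/sb_end_write(). A small usage sketch with a hypothetical caller:

static int example_write_to_mount(struct vfsmount *mnt)
{
	int err;

	err = mnt_get_write_access(mnt);	/* no freeze protection */
	if (err)
		return err;

	/* ... perform the modification ... */

	mnt_put_write_access(mnt);		/* must balance the call above */
	return 0;
}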
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 7df2503cef6c..01ac733a6320 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -125,7 +125,7 @@ config PNFS_BLOCK
config PNFS_FLEXFILE_LAYOUT
tristate
- depends on NFS_V4_1 && NFS_V3
+ depends on NFS_V4_1
default NFS_V4
config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 716bc75e9ed2..b4294a8aa2d4 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -108,7 +108,7 @@ struct pnfs_block_dev {
struct pnfs_block_dev *children;
u64 chunk_size;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
u64 disk_offset;
u64 pr_key;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 65cbb5607a5f..f318a05a80e1 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev)
} else {
if (dev->pr_registered) {
const struct pr_ops *ops =
- dev->bdev->bd_disk->fops->pr_ops;
+ dev->bdev_handle->bdev->bd_disk->fops->pr_ops;
int error;
- error = ops->pr_register(dev->bdev, dev->pr_key, 0,
- false);
+ error = ops->pr_register(dev->bdev_handle->bdev,
+ dev->pr_key, 0, false);
if (error)
pr_err("failed to unregister PR key.\n");
}
- if (dev->bdev)
- blkdev_put(dev->bdev, NULL);
+ if (dev->bdev_handle)
+ bdev_release(dev->bdev_handle);
}
}
@@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
map->start = dev->start;
map->len = dev->len;
map->disk_offset = dev->disk_offset;
- map->bdev = dev->bdev;
+ map->bdev = dev->bdev_handle->bdev;
return true;
}
@@ -236,28 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
- NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev_handle)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
- return PTR_ERR(bdev);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle));
+ return PTR_ERR(bdev_handle);
}
- d->bdev = bdev;
-
-
- d->len = bdev_nr_bytes(d->bdev);
+ d->bdev_handle = bdev_handle;
+ d->len = bdev_nr_bytes(bdev_handle->bdev);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
- d->bdev->bd_disk->disk_name);
+ bdev_handle->bdev->bd_disk->disk_name);
return 0;
}
@@ -302,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
-static struct block_device *
+static struct bdev_handle *
bl_open_path(struct pnfs_block_volume *v, const char *prefix)
{
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
const char *devname;
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
@@ -313,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
if (!devname)
return ERR_PTR(-ENOMEM);
- bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
- NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev_handle)) {
pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(bdev));
+ devname, PTR_ERR(bdev_handle));
}
kfree(devname);
- return bdev;
+ return bdev_handle;
}
static int
@@ -329,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
const struct pr_ops *ops;
int error;
@@ -342,32 +340,32 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
* On other distributions like Debian, the default SCSI by-id path will
* point to the dm-multipath device if one exists.
*/
- bdev = bl_open_path(v, "dm-uuid-mpath-0x");
- if (IS_ERR(bdev))
- bdev = bl_open_path(v, "wwn-0x");
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- d->bdev = bdev;
-
- d->len = bdev_nr_bytes(d->bdev);
+ bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x");
+ if (IS_ERR(bdev_handle))
+ bdev_handle = bl_open_path(v, "wwn-0x");
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
+ d->bdev_handle = bdev_handle;
+
+ d->len = bdev_nr_bytes(d->bdev_handle->bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
- d->bdev->bd_disk->disk_name, d->pr_key);
+ d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
- ops = d->bdev->bd_disk->fops->pr_ops;
+ ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: block device %s does not support reservations.",
- d->bdev->bd_disk->disk_name);
+ d->bdev_handle->bdev->bd_disk->disk_name);
error = -EINVAL;
goto out_blkdev_put;
}
- error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+ error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true);
if (error) {
pr_err("pNFS: failed to register key for block device %s.",
- d->bdev->bd_disk->disk_name);
+ d->bdev_handle->bdev->bd_disk->disk_name);
goto out_blkdev_put;
}
@@ -375,7 +373,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- blkdev_put(d->bdev, NULL);
+ bdev_release(d->bdev_handle);
return error;
}
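The blocklayout changes above move from blkdev_get_by_dev()/blkdev_put() to the handle-based block device API. A hedged sketch of the open/use/release pattern (example_open_bdev is hypothetical):

static int example_open_bdev(dev_t dev, struct bdev_handle **out)
{
	struct bdev_handle *handle;

	handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
				  NULL, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* the underlying block_device is reached through handle->bdev */
	pr_info("opened %s\n", handle->bdev->bd_disk->disk_name);

	*out = handle;
	return 0;
}

/* Teardown becomes bdev_release(handle) rather than blkdev_put(bdev, holder). */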
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 466ebf1d41b2..4ffa1f469e90 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_freezable_should_stop(NULL))
+ while (!svc_thread_should_stop(rqstp))
svc_recv(rqstp);
svc_exit_thread(rqstp);
@@ -86,45 +86,6 @@ nfs4_callback_svc(void *vrqstp)
}
#if defined(CONFIG_NFS_V4_1)
-/*
- * The callback service for NFSv4.1 callbacks
- */
-static int
-nfs41_callback_svc(void *vrqstp)
-{
- struct svc_rqst *rqstp = vrqstp;
- struct svc_serv *serv = rqstp->rq_server;
- struct rpc_rqst *req;
- int error;
- DEFINE_WAIT(wq);
-
- set_freezable();
-
- while (!kthread_freezable_should_stop(NULL)) {
- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
- spin_lock_bh(&serv->sv_cb_lock);
- if (!list_empty(&serv->sv_cb_list)) {
- req = list_first_entry(&serv->sv_cb_list,
- struct rpc_rqst, rq_bc_list);
- list_del(&req->rq_bc_list);
- spin_unlock_bh(&serv->sv_cb_lock);
- finish_wait(&serv->sv_cb_waitq, &wq);
- dprintk("Invoking bc_svc_process()\n");
- error = bc_svc_process(serv, req, rqstp);
- dprintk("bc_svc_process() returned w/ error code= %d\n",
- error);
- } else {
- spin_unlock_bh(&serv->sv_cb_lock);
- if (!kthread_should_stop())
- schedule();
- finish_wait(&serv->sv_cb_waitq, &wq);
- }
- }
-
- svc_exit_thread(rqstp);
- return 0;
-}
-
static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
struct svc_serv *serv)
{
@@ -237,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
cb_info->users);
threadfn = nfs4_callback_svc;
-#if defined(CONFIG_NFS_V4_1)
- if (minorversion)
- threadfn = nfs41_callback_svc;
-#else
+#if !defined(CONFIG_NFS_V4_1)
if (minorversion)
return ERR_PTR(-ENOTSUPP);
#endif
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 6bed1394d748..96a4923080ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -60,7 +60,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
if (nfs_have_writebacks(inode))
res->change_attr++;
res->ctime = inode_get_ctime(inode);
- res->mtime = inode->i_mtime;
+ res->mtime = inode_get_mtime(inode);
res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
args->bitmap[0];
res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cf7365581031..fa1a14def45c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -448,6 +448,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
delegation->cred = get_cred(cred);
delegation->inode = inode;
delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+ delegation->test_gen = 0;
spin_lock_init(&delegation->lock);
spin_lock(&clp->cl_lock);
@@ -1294,6 +1295,8 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
struct inode *inode;
const struct cred *cred;
nfs4_stateid stateid;
+ unsigned long gen = ++server->delegation_gen;
+
restart:
rcu_read_lock();
restart_locked:
@@ -1303,7 +1306,8 @@ restart_locked:
test_bit(NFS_DELEGATION_RETURNING,
&delegation->flags) ||
test_bit(NFS_DELEGATION_TEST_EXPIRED,
- &delegation->flags) == 0)
+ &delegation->flags) == 0 ||
+ delegation->test_gen == gen)
continue;
inode = nfs_delegation_grab_inode(delegation);
if (inode == NULL)
@@ -1312,6 +1316,7 @@ restart_locked:
cred = get_cred_rcu(delegation->cred);
nfs4_stateid_copy(&stateid, &delegation->stateid);
spin_unlock(&delegation->lock);
+ delegation->test_gen = gen;
clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
rcu_read_unlock();
nfs_delegation_test_free_expired(inode, &stateid, cred);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 1c378992b7c0..a6f495d012cf 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -21,6 +21,7 @@ struct nfs_delegation {
fmode_t type;
unsigned long pagemod_limit;
__u64 change_attr;
+ unsigned long test_gen;
unsigned long flags;
refcount_t refcount;
spinlock_t lock;
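The test_gen field added above gives each reaping pass a generation number: nfs_server_reap_expired_delegations() bumps server->delegation_gen once per pass and skips any delegation already stamped with that value, so restarts of the walk cannot re-test the same delegation. A generic sketch of the pattern (all example_* names are hypothetical):

struct example_item {
	struct list_head list;
	unsigned long test_gen;		/* generation of the pass that last visited it */
};

static void example_reap_pass(struct list_head *items, unsigned long *pass_gen)
{
	unsigned long gen = ++(*pass_gen);
	struct example_item *it;

	list_for_each_entry(it, items, list) {
		if (it->test_gen == gen)
			continue;	/* already handled during this pass */
		it->test_gen = gen;
		/* ... expensive per-item test or recovery work ... */
	}
}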
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e6a51fd94fea..13dffe4201e6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2532,7 +2532,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
- struct page *page;
+ struct folio *folio;
char *kaddr;
struct iattr attr;
unsigned int pathlen = strlen(symname);
@@ -2547,24 +2547,24 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- page = alloc_page(GFP_USER);
- if (!page)
+ folio = folio_alloc(GFP_USER, 0);
+ if (!folio)
return -ENOMEM;
- kaddr = page_address(page);
+ kaddr = folio_address(folio);
memcpy(kaddr, symname, pathlen);
if (pathlen < PAGE_SIZE)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
trace_nfs_symlink_enter(dir, dentry);
- error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+ error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr);
trace_nfs_symlink_exit(dir, dentry, error);
if (error != 0) {
dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
dir->i_sb->s_id, dir->i_ino,
dentry, symname, error);
d_drop(dentry);
- __free_page(page);
+ folio_put(folio);
return error;
}
@@ -2574,18 +2574,13 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
* No big deal if we can't add this page to the page cache here.
* READLINK will get the missing page from the server if needed.
*/
- if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0,
- GFP_KERNEL)) {
- SetPageUptodate(page);
- unlock_page(page);
- /*
- * add_to_page_cache_lru() grabs an extra page refcount.
- * Drop it here to avoid leaking this page later.
- */
- put_page(page);
- } else
- __free_page(page);
+ if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0,
+ GFP_KERNEL) == 0) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
return 0;
}
EXPORT_SYMBOL_GPL(nfs_symlink);
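nfs_symlink() above now handles the symlink body as a folio from allocation to page-cache insertion, and always drops its own reference at the end instead of special-casing the add_to_page_cache_lru() refcount. A condensed sketch of that idiom (example_cache_symlink is hypothetical and skips the zero-padding done in the real code):

static int example_cache_symlink(struct address_space *mapping, const char *symname)
{
	struct folio *folio = folio_alloc(GFP_USER, 0);		/* order-0 folio */

	if (!folio)
		return -ENOMEM;

	memcpy(folio_address(folio), symname, strlen(symname) + 1);

	/* best effort: on failure READLINK simply refetches from the server */
	if (filemap_add_folio(mapping, folio, 0, GFP_KERNEL) == 0) {
		folio_mark_uptodate(folio);
		folio_unlock(folio);
	}
	folio_put(folio);	/* drop the allocation reference either way */
	return 0;
}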
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index aed0748fd6ec..c7bb5da93307 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr {
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
- struct nfs4_pnfs_ds *ds_list[];
+ struct nfs4_pnfs_ds *ds_list[] __counted_by(ds_num);
};
struct nfs4_filelayout_segment {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 354a031c69b1..f84b3fb0dddd 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment {
u64 stripe_unit;
u32 flags;
u32 mirror_array_cnt;
- struct nfs4_ff_layout_mirror *mirror_array[];
+ struct nfs4_ff_layout_mirror *mirror_array[] __counted_by(mirror_array_cnt);
};
struct nfs4_flexfile_layout {
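The __counted_by() annotations above tell the compiler, and the kernel's array bounds checking, which member holds the element count of the flexible array. A minimal sketch of declaring and allocating such a structure (hypothetical names):

struct example_mirror;

struct example_mirrors {
	u32 mirror_count;
	struct example_mirror *mirrors[] __counted_by(mirror_count);
};

static struct example_mirrors *example_alloc_mirrors(u32 n)
{
	struct example_mirrors *m;

	m = kzalloc(struct_size(m, mirrors, n), GFP_KERNEL);
	if (m)
		m->mirror_count = n;	/* set the counter before indexing mirrors[] */
	return m;
}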
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 2dc64454492b..5407ab8c8783 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -114,8 +114,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
struct inode *inode)
{
memset(auxdata, 0, sizeof(*auxdata));
- auxdata->mtime_sec = inode->i_mtime.tv_sec;
- auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+ auxdata->mtime_sec = inode_get_mtime(inode).tv_sec;
+ auxdata->mtime_nsec = inode_get_mtime(inode).tv_nsec;
auxdata->ctime_sec = inode_get_ctime(inode).tv_sec;
auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e21c073158e5..ebb8d60e1152 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -512,8 +512,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
} else
init_special_inode(inode, inode->i_mode, fattr->rdev);
- memset(&inode->i_atime, 0, sizeof(inode->i_atime));
- memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+ inode_set_atime(inode, 0, 0);
+ inode_set_mtime(inode, 0, 0);
inode_set_ctime(inode, 0, 0);
inode_set_iversion_raw(inode, 0);
inode->i_size = 0;
@@ -527,11 +527,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
nfsi->read_cache_jiffies = fattr->time_start;
nfsi->attr_gencount = fattr->gencount;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
@@ -742,9 +742,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME
| NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (attr->ia_valid & ATTR_ATIME_SET)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
@@ -758,9 +758,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME
| NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (attr->ia_valid & ATTR_MTIME_SET)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
@@ -1451,11 +1451,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode_set_ctime_to_ts(inode, fattr->ctime);
}
- ts = inode->i_mtime;
+ ts = inode_get_mtime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
&& timespec64_equal(&ts, &fattr->pre_mtime)) {
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
}
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
@@ -1506,7 +1506,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
invalid |= NFS_INO_INVALID_CHANGE;
- ts = inode->i_mtime;
+ ts = inode_get_mtime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
invalid |= NFS_INO_INVALID_MTIME;
@@ -1534,7 +1534,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
invalid |= NFS_INO_INVALID_NLINK;
- ts = inode->i_atime;
+ ts = inode_get_atime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
invalid |= NFS_INO_INVALID_ATIME;
@@ -2002,7 +2002,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
}
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
- fattr->pre_mtime = inode->i_mtime;
+ fattr->pre_mtime = inode_get_mtime(inode);
fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
}
if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
@@ -2184,7 +2184,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_MTIME;
@@ -2220,7 +2220,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
save_cache_validity & NFS_INO_INVALID_SIZE;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_ATIME;
diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h
index 5ba00610aede..0d3ce0460e35 100644
--- a/fs/nfs/nfs.h
+++ b/fs/nfs/nfs.h
@@ -18,7 +18,7 @@ struct nfs_subversion {
const struct rpc_version *rpc_vers; /* NFS version information */
const struct nfs_rpc_ops *rpc_ops; /* NFS operations */
const struct super_operations *sops; /* NFS Super operations */
- const struct xattr_handler **xattr; /* NFS xattr handlers */
+ const struct xattr_handler * const *xattr; /* NFS xattr handlers */
struct list_head list; /* List of NFS versions */
};
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 4bf208a0a8e9..2de66e4e8280 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -543,9 +543,10 @@ out:
}
static int
-nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio,
unsigned int len, struct iattr *sattr)
{
+ struct page *page = &folio->page;
struct nfs3_createdata *data;
struct dentry *d_alias;
int status = -ENOMEM;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 911f634ba3da..2ad66a8922f4 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -796,28 +796,9 @@ static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
struct shrink_control *sc);
-static struct shrinker nfs4_xattr_cache_shrinker = {
- .count_objects = nfs4_xattr_cache_count,
- .scan_objects = nfs4_xattr_cache_scan,
- .seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_MEMCG_AWARE,
-};
-
-static struct shrinker nfs4_xattr_entry_shrinker = {
- .count_objects = nfs4_xattr_entry_count,
- .scan_objects = nfs4_xattr_entry_scan,
- .seeks = DEFAULT_SEEKS,
- .batch = 512,
- .flags = SHRINKER_MEMCG_AWARE,
-};
-
-static struct shrinker nfs4_xattr_large_entry_shrinker = {
- .count_objects = nfs4_xattr_entry_count,
- .scan_objects = nfs4_xattr_entry_scan,
- .seeks = 1,
- .batch = 512,
- .flags = SHRINKER_MEMCG_AWARE,
-};
+static struct shrinker *nfs4_xattr_cache_shrinker;
+static struct shrinker *nfs4_xattr_entry_shrinker;
+static struct shrinker *nfs4_xattr_large_entry_shrinker;
static enum lru_status
cache_lru_isolate(struct list_head *item,
@@ -943,7 +924,7 @@ nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
struct nfs4_xattr_entry *entry;
struct list_lru *lru;
- lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);
@@ -971,7 +952,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
unsigned long count;
struct list_lru *lru;
- lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
count = list_lru_shrink_count(lru, sc);
@@ -991,18 +972,34 @@ static void nfs4_xattr_cache_init_once(void *p)
INIT_LIST_HEAD(&cache->dispose);
}
-static int nfs4_xattr_shrinker_init(struct shrinker *shrinker,
- struct list_lru *lru, const char *name)
+typedef unsigned long (*count_objects_cb)(struct shrinker *s,
+ struct shrink_control *sc);
+typedef unsigned long (*scan_objects_cb)(struct shrinker *s,
+ struct shrink_control *sc);
+
+static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker,
+ struct list_lru *lru, const char *name,
+ count_objects_cb count,
+ scan_objects_cb scan, long batch, int seeks)
{
- int ret = 0;
+ int ret;
- ret = register_shrinker(shrinker, name);
- if (ret)
+ *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name);
+ if (!*shrinker)
+ return -ENOMEM;
+
+ ret = list_lru_init_memcg(lru, *shrinker);
+ if (ret) {
+ shrinker_free(*shrinker);
return ret;
+ }
- ret = list_lru_init_memcg(lru, shrinker);
- if (ret)
- unregister_shrinker(shrinker);
+ (*shrinker)->count_objects = count;
+ (*shrinker)->scan_objects = scan;
+ (*shrinker)->batch = batch;
+ (*shrinker)->seeks = seeks;
+
+ shrinker_register(*shrinker);
return ret;
}
@@ -1010,7 +1007,7 @@ static int nfs4_xattr_shrinker_init(struct shrinker *shrinker,
static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker,
struct list_lru *lru)
{
- unregister_shrinker(shrinker);
+ shrinker_free(shrinker);
list_lru_destroy(lru);
}
@@ -1026,27 +1023,31 @@ int __init nfs4_xattr_cache_init(void)
return -ENOMEM;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker,
- &nfs4_xattr_cache_lru,
- "nfs-xattr_cache");
+ &nfs4_xattr_cache_lru, "nfs-xattr_cache",
+ nfs4_xattr_cache_count,
+ nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS);
if (ret)
goto out1;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker,
- &nfs4_xattr_entry_lru,
- "nfs-xattr_entry");
+ &nfs4_xattr_entry_lru, "nfs-xattr_entry",
+ nfs4_xattr_entry_count,
+ nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS);
if (ret)
goto out2;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker,
&nfs4_xattr_large_entry_lru,
- "nfs-xattr_large_entry");
+ "nfs-xattr_large_entry",
+ nfs4_xattr_entry_count,
+ nfs4_xattr_entry_scan, 512, 1);
if (!ret)
return 0;
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
&nfs4_xattr_entry_lru);
out2:
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
&nfs4_xattr_cache_lru);
out1:
kmem_cache_destroy(nfs4_xattr_cache_cachep);
@@ -1056,11 +1057,11 @@ out1:
void nfs4_xattr_cache_exit(void)
{
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker,
&nfs4_xattr_large_entry_lru);
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
&nfs4_xattr_entry_lru);
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
&nfs4_xattr_cache_lru);
kmem_cache_destroy(nfs4_xattr_cache_cachep);
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 47c5c1f86d66..581698f1b7b2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -209,6 +209,7 @@ struct nfs4_exception {
struct inode *inode;
nfs4_stateid *stateid;
long timeout;
+ unsigned short retrans;
unsigned char task_is_privileged : 1;
unsigned char delay : 1,
recovering : 1,
@@ -315,7 +316,7 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *,
struct nfs_fh *,
struct nfs_fattr *);
extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
-extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern const struct xattr_handler * const nfs4_xattr_handlers[];
extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
const struct nfs_open_context *ctx,
const struct nfs_lock_context *l_ctx,
@@ -546,6 +547,7 @@ extern unsigned short max_session_slots;
extern unsigned short max_session_cb_slots;
extern unsigned short send_implementation_id;
extern bool recover_lost_locks;
+extern short nfs_delay_retrans;
#define NFS4_CLIENT_ID_UNIQ_LEN (64)
extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5ee283eb9660..8a943fffaad5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -585,6 +585,21 @@ wait_on_recovery:
return 0;
}
+/*
+ * Track the number of NFS4ERR_DELAY related retransmissions and return
+ * EAGAIN if the 'softerr' mount option is set, and we've exceeded the limit
+ * set by 'nfs_delay_retrans'.
+ */
+static int nfs4_exception_should_retrans(const struct nfs_server *server,
+ struct nfs4_exception *exception)
+{
+ if (server->flags & NFS_MOUNT_SOFTERR && nfs_delay_retrans >= 0) {
+ if (exception->retrans++ >= (unsigned short)nfs_delay_retrans)
+ return -EAGAIN;
+ }
+ return 0;
+}
+
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
@@ -595,6 +610,11 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
ret = nfs4_do_handle_exception(server, errorcode, exception);
if (exception->delay) {
+ int ret2 = nfs4_exception_should_retrans(server, exception);
+ if (ret2 < 0) {
+ exception->retry = 0;
+ return ret2;
+ }
ret = nfs4_delay(&exception->timeout,
exception->interruptible);
goto out_retry;
@@ -623,6 +643,11 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
ret = nfs4_do_handle_exception(server, errorcode, exception);
if (exception->delay) {
+ int ret2 = nfs4_exception_should_retrans(server, exception);
+ if (ret2 < 0) {
+ exception->retry = 0;
+ return ret2;
+ }
rpc_delay(task, nfs4_update_delay(&exception->timeout));
goto out_retry;
}
@@ -5011,9 +5036,10 @@ static void nfs4_free_createdata(struct nfs4_createdata *data)
}
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr,
+ struct folio *folio, unsigned int len, struct iattr *sattr,
struct nfs4_label *label)
{
+ struct page *page = &folio->page;
struct nfs4_createdata *data;
int status = -ENAMETOOLONG;
@@ -5038,7 +5064,7 @@ out:
}
static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr)
+ struct folio *folio, unsigned int len, struct iattr *sattr)
{
struct nfs4_exception exception = {
.interruptible = true,
@@ -5049,7 +5075,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
label = nfs4_label_init_security(dir, dentry, sattr, &l);
do {
- err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
+ err = _nfs4_proc_symlink(dir, dentry, folio, len, sattr, label);
trace_nfs4_symlink(dir, &dentry->d_name, err);
err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
@@ -5622,7 +5648,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
- nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr);
+ nfs4_state_protect_write(hdr->ds_clp ? hdr->ds_clp : server->nfs_client, clnt, msg, hdr);
}
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -5663,7 +5689,8 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
data->res.server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
- nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
+ nfs4_state_protect(data->ds_clp ? data->ds_clp : server->nfs_client,
+ NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
}
static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args,
@@ -8934,6 +8961,7 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+try_again:
/* Test connection for session trunking. Async exchange_id call */
task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
if (IS_ERR(task))
@@ -8946,11 +8974,15 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
if (status == 0)
rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
- else if (rpc_clnt_xprt_switch_has_addr(clnt,
+ else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt,
(struct sockaddr *)&xprt->addr))
rpc_clnt_xprt_switch_remove_xprt(clnt, xprt);
rpc_put_task(task);
+ if (status == -NFS4ERR_DELAY) {
+ ssleep(1);
+ goto try_again;
+ }
}
EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
@@ -9621,6 +9653,9 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
nfs4_sequence_free_slot(&lgp->res.seq_res);
+ exception->state = NULL;
+ exception->stateid = NULL;
+
switch (nfs4err) {
case 0:
goto out;
@@ -9716,7 +9751,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
};
struct pnfs_layout_segment *
-nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp,
+ struct nfs4_exception *exception)
{
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
@@ -9736,13 +9772,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
RPC_TASK_MOVEABLE,
};
struct pnfs_layout_segment *lseg = NULL;
- struct nfs4_exception exception = {
- .inode = inode,
- .timeout = *timeout,
- };
int status = 0;
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
+ exception->retry = 0;
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
@@ -9753,11 +9786,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
goto out;
if (task->tk_status < 0) {
- status = nfs4_layoutget_handle_exception(task, lgp, &exception);
- *timeout = exception.timeout;
+ exception->retry = 1;
+ status = nfs4_layoutget_handle_exception(task, lgp, exception);
} else if (lgp->res.layoutp->len == 0) {
+ exception->retry = 1;
status = -EAGAIN;
- *timeout = nfs4_update_delay(&exception.timeout);
+ nfs4_update_delay(&exception->timeout);
} else
lseg = pnfs_layout_process(lgp);
out:
@@ -10737,7 +10771,7 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
};
#endif
-const struct xattr_handler *nfs4_xattr_handlers[] = {
+const struct xattr_handler * const nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
#if defined(CONFIG_NFS_V4_1)
&nfs4_xattr_nfs4_dacl_handler,
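The nfs4proc.c hunks above bound NFS4ERR_DELAY retries when the 'softerr' mount option is active: each delayed retry increments exception->retrans, and once it reaches the nfs_delay_retrans module parameter (negative means no limit; the default is -1) the exception handler returns -EAGAIN instead of sleeping again. A generic sketch of that bounded-retry shape, not the NFS client code itself:

static int example_call_with_bounded_delay(int (*op)(void *), void *arg,
					   int max_delay_retries)
{
	int retrans = 0;
	int err;

	for (;;) {
		err = op(arg);
		if (err != -EAGAIN)	/* stand-in for NFS4ERR_DELAY */
			return err;
		if (max_delay_retries >= 0 && retrans++ >= max_delay_retries)
			return -EAGAIN;	/* give up: the caller sees a soft error */
		msleep(100);		/* stand-in for the nfs4_delay() backoff */
	}
}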
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 84343aefbbd6..21a365357629 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1980,7 +1980,9 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
struct nfs4_layoutget *lgp;
nfs4_stateid stateid;
- long timeout = 0;
+ struct nfs4_exception exception = {
+ .inode = ino,
+ };
unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
bool first;
@@ -2144,7 +2146,7 @@ lookup_again:
lgp->lo = lo;
pnfs_get_layout_hdr(lo);
- lseg = nfs4_proc_layoutget(lgp, &timeout);
+ lseg = nfs4_proc_layoutget(lgp, &exception);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
nfs_layoutget_end(lo);
@@ -2171,6 +2173,8 @@ lookup_again:
goto out_put_layout_hdr;
}
if (lseg) {
+ if (!exception.retry)
+ goto out_put_layout_hdr;
if (first)
pnfs_clear_first_layoutget(lo);
trace_pnfs_update_layout(ino, pos, count,
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d886c8226d8f..db57a85500ee 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -35,6 +35,7 @@
#include <linux/nfs_page.h>
#include <linux/workqueue.h>
+struct nfs4_exception;
struct nfs4_opendata;
enum {
@@ -245,7 +246,9 @@ extern size_t max_response_pages(struct nfs_server *server);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
const struct cred *cred);
-extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout);
+extern struct pnfs_layout_segment *
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp,
+ struct nfs4_exception *exception);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
/* pnfs.c */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e3570c656b0f..ad3a321ae997 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -396,9 +396,10 @@ nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
}
static int
-nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio,
unsigned int len, struct iattr *sattr)
{
+ struct page *page = &folio->page;
struct nfs_fh *fh;
struct nfs_fattr *fattr;
struct nfs_symlinkargs arg = {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d6473cb00cb..075b31c93f87 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -129,11 +129,7 @@ static void nfs_ssc_unregister_ops(void)
}
#endif /* CONFIG_NFS_V4_2 */
-static struct shrinker acl_shrinker = {
- .count_objects = nfs_access_cache_count,
- .scan_objects = nfs_access_cache_scan,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *acl_shrinker;
/*
* Register the NFS filesystems
@@ -153,9 +149,18 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
- ret = register_shrinker(&acl_shrinker, "nfs-acl");
- if (ret < 0)
+
+ acl_shrinker = shrinker_alloc(0, "nfs-acl");
+ if (!acl_shrinker) {
+ ret = -ENOMEM;
goto error_3;
+ }
+
+ acl_shrinker->count_objects = nfs_access_cache_count;
+ acl_shrinker->scan_objects = nfs_access_cache_scan;
+
+ shrinker_register(acl_shrinker);
+
#ifdef CONFIG_NFS_V4_2
nfs_ssc_register_ops();
#endif
@@ -175,7 +180,7 @@ error_0:
*/
void __exit unregister_nfs_fs(void)
{
- unregister_shrinker(&acl_shrinker);
+ shrinker_free(acl_shrinker);
nfs_unregister_sysctl();
unregister_nfs4_fs();
#ifdef CONFIG_NFS_V4_2
@@ -1071,7 +1076,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
sb->s_export_op = &nfs_export_ops;
break;
case 4:
- sb->s_flags |= SB_POSIXACL;
+ sb->s_iflags |= SB_I_NOUMASK;
sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
@@ -1366,6 +1371,7 @@ unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
unsigned short send_implementation_id = 1;
char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
bool recover_lost_locks = false;
+short nfs_delay_retrans = -1;
EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
@@ -1376,6 +1382,7 @@ EXPORT_SYMBOL_GPL(max_session_cb_slots);
EXPORT_SYMBOL_GPL(send_implementation_id);
EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
EXPORT_SYMBOL_GPL(recover_lost_locks);
+EXPORT_SYMBOL_GPL(nfs_delay_retrans);
#define NFS_CALLBACK_MAXPORTNR (65535U)
@@ -1424,5 +1431,9 @@ MODULE_PARM_DESC(recover_lost_locks,
"If the server reports that a lock might be lost, "
"try to recover it risking data corruption.");
-
+module_param_named(delay_retrans, nfs_delay_retrans, short, 0644);
+MODULE_PARM_DESC(delay_retrans,
+ "Unless negative, specifies the number of times the NFSv4 "
+ "client retries a request before returning an EAGAIN error, "
+ "after a reply of NFS4ERR_DELAY from the server.");
#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9d82d50ce0b1..b664caea8b4e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -739,6 +739,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
&pgio);
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
+ if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR)
+ break;
} while (err < 0 && !nfs_error_is_fatal(err));
nfs_io_completion_put(ioc);
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 6fffc8f03f74..b8736a82e57c 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,7 +12,8 @@ nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o \
- stats.o filecache.o nfs3proc.o nfs3xdr.o
+ stats.o filecache.o nfs3proc.o nfs3xdr.o \
+ netlink.o
nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index fdf2aad73470..e6beaaf4f170 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -26,8 +26,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
int i;
int flags = nfsexp_flags(rqstp, exp);
- validate_process_creds();
-
/* discard any old override before preparing the new set */
revert_creds(get_cred(current_real_cred()));
new = prepare_creds();
@@ -81,10 +79,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
else
new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
new->cap_permitted);
- validate_process_creds();
put_cred(override_creds(new));
put_cred(new);
- validate_process_creds();
return 0;
oom:
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 01d7fd108cf3..46fd74d91ea9 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -117,12 +117,13 @@ static __be32
nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
struct iomap *iomaps, int nr_iomaps)
{
+ struct timespec64 mtime = inode_get_mtime(inode);
loff_t new_size = lcp->lc_last_wr + 1;
struct iattr iattr = { .ia_valid = 0 };
int error;
if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
- timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+ timespec64_compare(&lcp->lc_mtime, &mtime) < 0)
lcp->lc_mtime = current_time(inode);
iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 1ed2f691ebb9..ce78f74715ee 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -16,9 +16,9 @@
__be32
nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp)
+ const struct nfsd4_layoutget *lgp)
{
- struct pnfs_block_extent *b = lgp->lg_content;
+ const struct pnfs_block_extent *b = lgp->lg_content;
int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
__be32 *p;
@@ -77,7 +77,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
__be32
nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp)
+ const struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_block_deviceaddr *dev = gdp->gd_device;
int len = sizeof(__be32), ret, i;
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index bc5166bfe46b..b0361e8aa9a7 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -51,9 +51,9 @@ struct pnfs_block_deviceaddr {
};
__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp);
+ const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp);
+ const struct nfsd4_layoutget *lgp);
int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size);
int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 929248c6ca84..4cbe0434cbb8 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,8 +84,8 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn);
void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
-int nfsd_cache_lookup(struct svc_rqst *rqstp,
- struct nfsd_cacherep **cacherep);
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+ unsigned int len, struct nfsd_cacherep **cacherep);
void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
int cachetype, __be32 *statp);
int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 11a0eaa2f914..7b641095a665 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -339,12 +339,16 @@ static int export_stats_init(struct export_stats *stats)
static void export_stats_reset(struct export_stats *stats)
{
- nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
+ if (stats)
+ nfsd_percpu_counters_reset(stats->counter,
+ EXP_STATS_COUNTERS_NUM);
}
static void export_stats_destroy(struct export_stats *stats)
{
- nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
+ if (stats)
+ nfsd_percpu_counters_destroy(stats->counter,
+ EXP_STATS_COUNTERS_NUM);
}
static void svc_export_put(struct kref *ref)
@@ -353,7 +357,8 @@ static void svc_export_put(struct kref *ref)
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
- export_stats_destroy(&exp->ex_stats);
+ export_stats_destroy(exp->ex_stats);
+ kfree(exp->ex_stats);
kfree(exp->ex_uuid);
kfree_rcu(exp, ex_rcu);
}
@@ -421,8 +426,7 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid)
return -EINVAL;
}
- if (!inode->i_sb->s_export_op ||
- !inode->i_sb->s_export_op->fh_to_dentry) {
+ if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) {
dprintk("exp_export: export of invalid fs type.\n");
return -EINVAL;
}
@@ -767,13 +771,15 @@ static int svc_export_show(struct seq_file *m,
seq_putc(m, '\t');
seq_escape(m, exp->ex_client->name, " \t\n\\");
if (export_stats) {
- seq_printf(m, "\t%lld\n", exp->ex_stats.start_time);
+ struct percpu_counter *counter = exp->ex_stats->counter;
+
+ seq_printf(m, "\t%lld\n", exp->ex_stats->start_time);
seq_printf(m, "\tfh_stale: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE]));
seq_printf(m, "\tio_read: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ]));
seq_printf(m, "\tio_write: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE]));
seq_putc(m, '\n');
return 0;
}
@@ -819,7 +825,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_layout_types = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
- export_stats_reset(&new->ex_stats);
+ export_stats_reset(new->ex_stats);
}
static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -856,7 +862,14 @@ static struct cache_head *svc_export_alloc(void)
if (!i)
return NULL;
- if (export_stats_init(&i->ex_stats)) {
+ i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL);
+ if (!i->ex_stats) {
+ kfree(i);
+ return NULL;
+ }
+
+ if (export_stats_init(i->ex_stats)) {
+ kfree(i->ex_stats);
kfree(i);
return NULL;
}
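
The export.c hunks above turn the per-export stats into a separately allocated object, make the reset/destroy helpers tolerate NULL, and pair the kmalloc() in svc_export_alloc() with a kfree() in svc_export_put(). A generic sketch of that allocate/init/teardown pairing, assuming only the standard percpu_counter and slab APIs; all demo_* names are illustrative:

#include <linux/percpu_counter.h>
#include <linux/slab.h>

struct demo_stats {
	struct percpu_counter counter;
};

static struct demo_stats *demo_stats_alloc(void)
{
	struct demo_stats *stats = kmalloc(sizeof(*stats), GFP_KERNEL);

	if (!stats)
		return NULL;
	if (percpu_counter_init(&stats->counter, 0, GFP_KERNEL)) {
		kfree(stats);
		return NULL;
	}
	return stats;
}

static void demo_stats_free(struct demo_stats *stats)
{
	if (!stats)		/* mirrors the NULL checks added above */
		return;
	percpu_counter_destroy(&stats->counter);
	kfree(stats);
}
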
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 2df8ae25aad3..ca9dc230ae3d 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -64,10 +64,10 @@ struct svc_export {
struct cache_head h;
struct auth_domain * ex_client;
int ex_flags;
+ int ex_fsid;
struct path ex_path;
kuid_t ex_anon_uid;
kgid_t ex_anon_gid;
- int ex_fsid;
unsigned char * ex_uuid; /* 16 byte fsid */
struct nfsd4_fs_locations ex_fslocs;
uint32_t ex_nflavors;
@@ -76,8 +76,8 @@ struct svc_export {
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
struct rcu_head ex_rcu;
- struct export_stats ex_stats;
unsigned long ex_xprtsec_modes;
+ struct export_stats *ex_stats;
};
/* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ee9c923192e0..ef063f93fde9 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -521,11 +521,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
return ret;
}
-static struct shrinker nfsd_file_shrinker = {
- .scan_objects = nfsd_file_lru_scan,
- .count_objects = nfsd_file_lru_count,
- .seeks = 1,
-};
+static struct shrinker *nfsd_file_shrinker;
/**
* nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file
@@ -746,12 +742,19 @@ nfsd_file_cache_init(void)
goto out_err;
}
- ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
- if (ret) {
- pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
+ nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache");
+ if (!nfsd_file_shrinker) {
+ ret = -ENOMEM;
+ pr_err("nfsd: failed to allocate nfsd_file_shrinker\n");
goto out_lru;
}
+ nfsd_file_shrinker->count_objects = nfsd_file_lru_count;
+ nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan;
+ nfsd_file_shrinker->seeks = 1;
+
+ shrinker_register(nfsd_file_shrinker);
+
ret = lease_register_notifier(&nfsd_file_lease_notifier);
if (ret) {
pr_err("nfsd: unable to register lease notifier: %d\n", ret);
@@ -774,7 +777,7 @@ out:
out_notifier:
lease_unregister_notifier(&nfsd_file_lease_notifier);
out_shrinker:
- unregister_shrinker(&nfsd_file_shrinker);
+ shrinker_free(nfsd_file_shrinker);
out_lru:
list_lru_destroy(&nfsd_file_lru);
out_err:
@@ -891,7 +894,7 @@ nfsd_file_cache_shutdown(void)
return;
lease_unregister_notifier(&nfsd_file_lease_notifier);
- unregister_shrinker(&nfsd_file_shrinker);
+ shrinker_free(nfsd_file_shrinker);
/*
* make sure all callers of nfsd_file_lru_cb are done before
* calling nfsd_file_cache_purge
@@ -989,22 +992,21 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
struct net *net = SVC_NET(rqstp);
struct nfsd_file *new, *nf;
- const struct cred *cred;
+ bool stale_retry = true;
bool open_retry = true;
struct inode *inode;
__be32 status;
int ret;
+retry:
status = fh_verify(rqstp, fhp, S_IFREG,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
if (status != nfs_ok)
return status;
inode = d_inode(fhp->fh_dentry);
- cred = get_current_cred();
-retry:
rcu_read_lock();
- nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+ nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
rcu_read_unlock();
if (nf) {
@@ -1026,7 +1028,7 @@ retry:
rcu_read_lock();
spin_lock(&inode->i_lock);
- nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+ nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
if (unlikely(nf)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
@@ -1058,6 +1060,7 @@ wait_for_construction:
goto construction_err;
}
open_retry = false;
+ fh_put(fhp);
goto retry;
}
this_cpu_inc(nfsd_file_cache_hits);
@@ -1074,7 +1077,6 @@ out:
nfsd_file_check_write_error(nf);
*pnf = nf;
}
- put_cred(cred);
trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status);
return status;
@@ -1088,8 +1090,20 @@ open_file:
status = nfs_ok;
trace_nfsd_file_opened(nf, status);
} else {
- status = nfsd_open_verified(rqstp, fhp, may_flags,
- &nf->nf_file);
+ ret = nfsd_open_verified(rqstp, fhp, may_flags,
+ &nf->nf_file);
+ if (ret == -EOPENSTALE && stale_retry) {
+ stale_retry = false;
+ nfsd_file_unhash(nf);
+ clear_and_wake_up_bit(NFSD_FILE_PENDING,
+ &nf->nf_flags);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
+ nf = NULL;
+ fh_put(fhp);
+ goto retry;
+ }
+ status = nfserrno(ret);
trace_nfsd_file_open(nf, status);
}
} else
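
The filecache.c hunks above (and the nfs4state.c / netns.h hunks later in this diff) move from a statically defined struct shrinker registered with register_shrinker() to the allocated shrinker API: shrinker_alloc(), field assignment, shrinker_register(), and shrinker_free() on teardown, with private_data standing in for the old container_of() lookups. A self-contained sketch of that lifecycle, assuming only those APIs; the demo_* names and the cache they "count" are illustrative only:

#include <linux/shrinker.h>
#include <linux/errno.h>

static struct shrinker *demo_shrinker;

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
	/* report how many objects could be reclaimed */
	return *(unsigned long *)s->private_data;
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
	/* a real implementation would free objects and return the count */
	return SHRINK_STOP;
}

static int demo_shrinker_init(unsigned long *nr_cached)
{
	demo_shrinker = shrinker_alloc(0, "demo-cache");
	if (!demo_shrinker)
		return -ENOMEM;

	demo_shrinker->count_objects = demo_count;
	demo_shrinker->scan_objects  = demo_scan;
	demo_shrinker->seeks         = DEFAULT_SEEKS;
	demo_shrinker->private_data  = nr_cached;

	shrinker_register(demo_shrinker);	/* live from this point on */
	return 0;
}

static void demo_shrinker_exit(void)
{
	shrinker_free(demo_shrinker);		/* unregisters and frees */
}
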
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
index bb205328e043..aeb71c10ff1b 100644
--- a/fs/nfsd/flexfilelayoutxdr.c
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -17,9 +17,9 @@ struct ff_idmap {
__be32
nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp)
+ const struct nfsd4_layoutget *lgp)
{
- struct pnfs_ff_layout *fl = lgp->lg_content;
+ const struct pnfs_ff_layout *fl = lgp->lg_content;
int len, mirror_len, ds_len, fh_len;
__be32 *p;
@@ -77,7 +77,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
__be32
nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp)
+ const struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_ff_device_addr *da = gdp->gd_device;
int len;
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h
index 8e195aeca023..6d5a1066a903 100644
--- a/fs/nfsd/flexfilelayoutxdr.h
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -43,8 +43,8 @@ struct pnfs_ff_layout {
};
__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp);
+ const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp);
+ const struct nfsd4_layoutget *lgp);
#endif /* _NFSD_FLEXFILELAYOUTXDR_H */
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
new file mode 100644
index 000000000000..0e1d635ec5f9
--- /dev/null
+++ b/fs/nfsd/netlink.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netlink.h"
+
+#include <uapi/linux/nfsd_netlink.h>
+
+/* Ops table for nfsd */
+static const struct genl_split_ops nfsd_nl_ops[] = {
+ {
+ .cmd = NFSD_CMD_RPC_STATUS_GET,
+ .start = nfsd_nl_rpc_status_get_start,
+ .dumpit = nfsd_nl_rpc_status_get_dumpit,
+ .done = nfsd_nl_rpc_status_get_done,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+};
+
+struct genl_family nfsd_nl_family __ro_after_init = {
+ .name = NFSD_FAMILY_NAME,
+ .version = NFSD_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = nfsd_nl_ops,
+ .n_split_ops = ARRAY_SIZE(nfsd_nl_ops),
+};
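
The generated netlink.c above only defines nfsd_nl_family and its split-ops table; registering the family happens elsewhere in this series (not shown in this hunk). A sketch of how such a generic netlink family is typically wired up at init/exit time; the demo_* wrappers are hypothetical, only the genetlink calls are standard API:

#include <linux/init.h>
#include <net/genetlink.h>

#include "netlink.h"

static int __init demo_nfsd_nl_init(void)
{
	return genl_register_family(&nfsd_nl_family);
}

static void __exit demo_nfsd_nl_exit(void)
{
	genl_unregister_family(&nfsd_nl_family);
}
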
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
new file mode 100644
index 000000000000..d83dd6bdee92
--- /dev/null
+++ b/fs/nfsd/netlink.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_NFSD_GEN_H
+#define _LINUX_NFSD_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/nfsd_netlink.h>
+
+int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
+int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
+
+int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+
+extern struct genl_family nfsd_nl_family;
+
+#endif /* _LINUX_NFSD_GEN_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ec49b200b797..ab303a8b77d5 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -177,7 +177,7 @@ struct nfsd_net {
/* size of cache when we saw the longest hash chain */
unsigned int longest_chain_cachesize;
- struct shrinker nfsd_reply_cache_shrinker;
+ struct shrinker *nfsd_reply_cache_shrinker;
/* tracking server-to-server copy mounts */
spinlock_t nfsd_ssc_lock;
@@ -195,7 +195,7 @@ struct nfsd_net {
int nfs4_max_clients;
atomic_t nfsd_courtesy_clients;
- struct shrinker nfsd_client_shrinker;
+ struct shrinker *nfsd_client_shrinker;
struct work_struct nfsd_shrinker_work;
};
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 268ef57751c4..b78eceebd945 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -171,7 +171,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
* + 1 (xdr opaque byte count) = 26
*/
resp->count = argp->count;
- svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+ svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3) << 2) +
+ resp->count + 4);
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
@@ -194,7 +195,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
SVCFH_fmt(&argp->fh),
argp->len,
(unsigned long long) argp->offset,
- argp->stable? " stable" : "");
+ argp->stable ? " stable" : "");
resp->status = nfserr_fbig;
if (argp->offset > (u64)OFFSET_MAX ||
@@ -294,8 +295,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS3_CREATE_EXCLUSIVE:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
break;
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index e8a80052cb1b..5e8096bc5eaa 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -515,11 +515,11 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
if (!list_empty(&ls->ls_layouts)) {
if (found)
nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
- lrp->lrs_present = 1;
+ lrp->lrs_present = true;
} else {
trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
nfs4_unhash_stid(&ls->ls_stid);
- lrp->lrs_present = 0;
+ lrp->lrs_present = false;
}
spin_unlock(&ls->ls_lock);
@@ -539,7 +539,7 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp,
struct nfs4_layout *lp, *t;
LIST_HEAD(reaplist);
- lrp->lrs_present = 0;
+ lrp->lrs_present = false;
spin_lock(&clp->cl_lock);
list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4199ede0583c..6f2d4aa4970d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -322,8 +322,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS4_CREATE_EXCLUSIVE:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
open->op_created = true;
break; /* subtle */
@@ -331,8 +331,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS4_CREATE_EXCLUSIVE4_1:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
open->op_created = true;
goto set_attr; /* subtle */
@@ -1329,7 +1329,8 @@ extern void nfs_sb_deactive(struct super_block *sb);
* setup a work entry in the ssc delayed unmount list.
*/
static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
- struct nfsd4_ssc_umount_item **nsui)
+ struct nfsd4_ssc_umount_item **nsui,
+ struct svc_rqst *rqstp)
{
struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *work = NULL;
@@ -1351,7 +1352,7 @@ try_again:
spin_unlock(&nn->nfsd_ssc_lock);
/* allow 20secs for mount/unmount for now - revisit */
- if (kthread_should_stop() ||
+ if (svc_thread_should_stop(rqstp) ||
(schedule_timeout(20*HZ) == 0)) {
finish_wait(&nn->nfsd_ssc_waitq, &wait);
kfree(work);
@@ -1467,7 +1468,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
goto out_free_rawdata;
snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
- status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui);
+ status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp);
if (status)
goto out_free_devname;
if ((*nsui)->nsui_vfsmount)
@@ -1642,6 +1643,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
if (bytes_total == 0)
bytes_total = ULLONG_MAX;
do {
+ /* Only async copies can be stopped here */
if (kthread_should_stop())
break;
bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
@@ -1760,6 +1762,7 @@ static int nfsd4_do_async_copy(void *data)
struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
__be32 nfserr;
+ trace_nfsd_copy_do_async(copy);
if (nfsd4_ssc_is_inter(copy)) {
struct file *filp;
@@ -1798,21 +1801,27 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_copy *async_copy = NULL;
+ copy->cp_clp = cstate->clp;
if (nfsd4_ssc_is_inter(copy)) {
+ trace_nfsd_copy_inter(copy);
if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) {
status = nfserr_notsupp;
goto out;
}
status = nfsd4_setup_inter_ssc(rqstp, cstate, copy);
- if (status)
+ if (status) {
+ trace_nfsd_copy_done(copy, status);
return nfserr_offload_denied;
+ }
} else {
+ trace_nfsd_copy_intra(copy);
status = nfsd4_setup_intra_ssc(rqstp, cstate, copy);
- if (status)
+ if (status) {
+ trace_nfsd_copy_done(copy, status);
return status;
+ }
}
- copy->cp_clp = cstate->clp;
memcpy(&copy->fh, &cstate->current_fh.fh_handle,
sizeof(struct knfsd_fh));
if (nfsd4_copy_is_async(copy)) {
@@ -1847,6 +1856,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
copy->nf_dst->nf_file, true);
}
out:
+ trace_nfsd_copy_done(copy, status);
release_copy_files(copy);
return status;
out_err:
@@ -1929,8 +1939,8 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
- cn->cpn_sec = nn->nfsd4_lease;
- cn->cpn_nsec = 0;
+ cn->cpn_lease_time.tv_sec = nn->nfsd4_lease;
+ cn->cpn_lease_time.tv_nsec = 0;
status = nfserrno(-ENOMEM);
cps = nfs4_alloc_init_cpntf_state(nn, stid);
@@ -2347,10 +2357,10 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
mutex_unlock(&ls->ls_mutex);
if (new_size > i_size_read(inode)) {
- lcp->lc_size_chg = 1;
+ lcp->lc_size_chg = true;
lcp->lc_newsize = new_size;
} else {
- lcp->lc_size_chg = 0;
+ lcp->lc_size_chg = false;
}
nfserr = ops->proc_layoutcommit(inode, lcp);
@@ -3200,6 +3210,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCK] = {
.op_func = nfsd4_lock,
+ .op_release = nfsd4_lock_release,
.op_flags = OP_MODIFIES_SOMETHING |
OP_NONTRIVIAL_ERROR_ENCODE,
.op_name = "OP_LOCK",
@@ -3208,6 +3219,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCKT] = {
.op_func = nfsd4_lockt,
+ .op_release = nfsd4_lockt_release,
.op_flags = OP_NONTRIVIAL_ERROR_ENCODE,
.op_name = "OP_LOCKT",
.op_rsize_bop = nfsd4_lock_rsize,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8534693eb6a4..3edbfa0233e6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -59,7 +59,7 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
-#define all_ones {{~0,~0},~0}
+#define all_ones {{ ~0, ~0}, ~0}
static const stateid_t one_stateid = {
.si_generation = ~0,
.si_opaque = all_ones,
@@ -297,7 +297,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
nbl = find_blocked_lock(lo, fh, nn);
if (!nbl) {
- nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
+ nbl = kmalloc(sizeof(*nbl), GFP_KERNEL);
if (nbl) {
INIT_LIST_HEAD(&nbl->nbl_list);
INIT_LIST_HEAD(&nbl->nbl_lru);
@@ -1159,6 +1159,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
struct nfs4_clnt_odstate *odstate, u32 dl_type)
{
struct nfs4_delegation *dp;
+ struct nfs4_stid *stid;
long n;
dprintk("NFSD alloc_init_deleg\n");
@@ -1167,9 +1168,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
goto out_dec;
if (delegation_blocked(&fp->fi_fhandle))
goto out_dec;
- dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
- if (dp == NULL)
+ stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg);
+ if (stid == NULL)
goto out_dec;
+ dp = delegstateid(stid);
/*
* delegation seqid's are never incremented. The 4.1 special
@@ -2797,7 +2799,7 @@ static int client_opens_release(struct inode *inode, struct file *file)
/* XXX: alternatively, we could get/drop in seq start/stop */
drop_client(clp);
- return 0;
+ return seq_release(inode, file);
}
static const struct file_operations client_states_fops = {
@@ -4400,8 +4402,7 @@ static unsigned long
nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
int count;
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_client_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
count = atomic_read(&nn->nfsd_courtesy_clients);
if (!count)
@@ -5636,11 +5637,11 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
int status = 0;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
- open->op_recall = 0;
+ open->op_recall = false;
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_PREVIOUS:
if (!cb_up)
- open->op_recall = 1;
+ open->op_recall = true;
break;
case NFS4_OPEN_CLAIM_NULL:
parent = currentfh;
@@ -5682,7 +5683,7 @@ out_no_deleg:
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
dprintk("NFSD: WARNING: refusing delegation reclaim\n");
- open->op_recall = 1;
+ open->op_recall = true;
}
/* 4.1 client asking for a delegation? */
@@ -7487,6 +7488,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_blocked_lock *nbl = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
+ struct super_block *sb;
__be32 status = 0;
int lkflg;
int err;
@@ -7508,6 +7510,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_lock: permission denied!\n");
return status;
}
+ sb = cstate->current_fh.fh_dentry->d_sb;
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
@@ -7559,7 +7562,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate))
+ if (nfsd4_has_session(cstate) ||
+ exportfs_lock_op_is_async(sb->s_export_op))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
@@ -7571,7 +7575,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fl_type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate))
+ if (nfsd4_has_session(cstate) ||
+ exportfs_lock_op_is_async(sb->s_export_op))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
@@ -7599,7 +7604,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* for file locks), so don't attempt blocking lock notifications
* on those filesystems:
*/
- if (nf->nf_file->f_op->lock)
+ if (!exportfs_lock_op_is_async(sb->s_export_op))
fl_flags &= ~FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
@@ -7705,6 +7710,14 @@ out:
return status;
}
+void nfsd4_lock_release(union nfsd4_op_u *u)
+{
+ struct nfsd4_lock *lock = &u->lock;
+ struct nfsd4_lock_denied *deny = &lock->lk_denied;
+
+ kfree(deny->ld_owner.data);
+}
+
/*
* The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
* so we do a temporary open here just to get an open file to pass to
@@ -7810,6 +7823,14 @@ out:
return status;
}
+void nfsd4_lockt_release(union nfsd4_op_u *u)
+{
+ struct nfsd4_lockt *lockt = &u->lockt;
+ struct nfsd4_lock_denied *deny = &lockt->lt_denied;
+
+ kfree(deny->ld_owner.data);
+}
+
__be32
nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -8149,12 +8170,16 @@ static int nfs4_state_create_net(struct net *net)
INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
get_net(net);
- nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
- nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
- nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
-
- if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"))
+ nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client");
+ if (!nn->nfsd_client_shrinker)
goto err_shrinker;
+
+ nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan;
+ nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count;
+ nn->nfsd_client_shrinker->private_data = nn;
+
+ shrinker_register(nn->nfsd_client_shrinker);
+
return 0;
err_shrinker:
@@ -8252,7 +8277,7 @@ nfs4_state_shutdown_net(struct net *net)
struct list_head *pos, *next, reaplist;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- unregister_shrinker(&nn->nfsd_client_shrinker);
+ shrinker_free(nn->nfsd_client_shrinker);
cancel_work(&nn->nfsd_shrinker_work);
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 92c7dde148a4..b499fe9caa32 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2530,66 +2530,62 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
return true;
}
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
- struct svc_export *exp)
+static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr,
+ struct knfsd_fh *fh_handle)
{
- if (exp->ex_flags & NFSEXP_V4ROOT) {
- *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
- *p++ = 0;
- } else
- p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
- return p;
+ return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size);
}
+/* This is a frequently-encoded type; open-coded for speed */
static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
- struct timespec64 *tv)
+ const struct timespec64 *tv)
{
__be32 *p;
p = xdr_reserve_space(xdr, XDR_UNIT * 3);
if (!p)
return nfserr_resource;
-
- p = xdr_encode_hyper(p, (s64)tv->tv_sec);
+ p = xdr_encode_hyper(p, tv->tv_sec);
*p = cpu_to_be32(tv->tv_nsec);
return nfs_ok;
}
-/*
- * ctime (in NFSv4, time_metadata) is not writeable, and the client
- * doesn't really care what resolution could theoretically be stored by
- * the filesystem.
- *
- * The client cares how close together changes can be while still
- * guaranteeing ctime changes. For most filesystems (which have
- * timestamps with nanosecond fields) that is limited by the resolution
- * of the time returned from current_time() (which I'm assuming to be
- * 1/HZ).
- */
-static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
+static __be32 nfsd4_encode_specdata4(struct xdr_stream *xdr,
+ unsigned int major, unsigned int minor)
{
- struct timespec64 ts;
- u32 ns;
+ __be32 status;
- ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
- ts = ns_to_timespec64(ns);
+ status = nfsd4_encode_uint32_t(xdr, major);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_uint32_t(xdr, minor);
+}
- p = xdr_encode_hyper(p, ts.tv_sec);
- *p++ = cpu_to_be32(ts.tv_nsec);
+static __be32
+nfsd4_encode_change_info4(struct xdr_stream *xdr, const struct nfsd4_change_info *c)
+{
+ __be32 status;
- return p;
+ status = nfsd4_encode_bool(xdr, c->atomic);
+ if (status != nfs_ok)
+ return status;
+ status = nfsd4_encode_changeid4(xdr, c->before_change);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_changeid4(xdr, c->after_change);
}
-static __be32
-nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c)
+static __be32 nfsd4_encode_netaddr4(struct xdr_stream *xdr,
+ const struct nfs42_netaddr *addr)
{
- if (xdr_stream_encode_bool(xdr, c->atomic) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u64(xdr, c->before_change) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u64(xdr, c->after_change) < 0)
- return nfserr_resource;
- return nfs_ok;
+ __be32 status;
+
+ /* na_r_netid */
+ status = nfsd4_encode_opaque(xdr, addr->netid, addr->netid_len);
+ if (status != nfs_ok)
+ return status;
+ /* na_r_addr */
+ return nfsd4_encode_opaque(xdr, addr->addr, addr->addr_len);
}
/* Encode as an array of strings the string given with components
@@ -2661,9 +2657,6 @@ static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep,
return nfsd4_encode_components_esc(xdr, sep, components, 0, 0);
}
-/*
- * encode a location element of a fs_locations structure
- */
static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
struct nfsd4_fs_location *location)
{
@@ -2676,15 +2669,12 @@ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
status = nfsd4_encode_components(xdr, '/', location->path);
if (status)
return status;
- return 0;
+ return nfs_ok;
}
-/*
- * Encode a path in RFC3530 'pathname4' format
- */
-static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
- const struct path *root,
- const struct path *path)
+static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr,
+ const struct path *root,
+ const struct path *path)
{
struct path cur = *path;
__be32 *p;
@@ -2752,89 +2742,59 @@ out_free:
return err;
}
-static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr,
- struct svc_rqst *rqstp, const struct path *path)
+static __be32 nfsd4_encode_fs_locations4(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp,
+ struct svc_export *exp)
{
+ struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
struct svc_export *exp_ps;
- __be32 res;
+ unsigned int i;
+ __be32 status;
+ /* fs_root */
exp_ps = rqst_find_fsidzero_export(rqstp);
if (IS_ERR(exp_ps))
return nfserrno(PTR_ERR(exp_ps));
- res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path);
+ status = nfsd4_encode_pathname4(xdr, &exp_ps->ex_path, &exp->ex_path);
exp_put(exp_ps);
- return res;
-}
-
-/*
- * encode a fs_locations structure
- */
-static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr,
- struct svc_rqst *rqstp, struct svc_export *exp)
-{
- __be32 status;
- int i;
- __be32 *p;
- struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
-
- status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path);
- if (status)
+ if (status != nfs_ok)
return status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+
+ /* locations<> */
+ if (xdr_stream_encode_u32(xdr, fslocs->locations_count) != XDR_UNIT)
return nfserr_resource;
- *p++ = cpu_to_be32(fslocs->locations_count);
- for (i=0; i<fslocs->locations_count; i++) {
+ for (i = 0; i < fslocs->locations_count; i++) {
status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]);
- if (status)
+ if (status != nfs_ok)
return status;
}
- return 0;
-}
-static u32 nfs4_file_type(umode_t mode)
-{
- switch (mode & S_IFMT) {
- case S_IFIFO: return NF4FIFO;
- case S_IFCHR: return NF4CHR;
- case S_IFDIR: return NF4DIR;
- case S_IFBLK: return NF4BLK;
- case S_IFLNK: return NF4LNK;
- case S_IFREG: return NF4REG;
- case S_IFSOCK: return NF4SOCK;
- default: return NF4BAD;
- }
+ return nfs_ok;
}
-static inline __be32
-nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
- struct nfs4_ace *ace)
+static __be32 nfsd4_encode_nfsace4(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ struct nfs4_ace *ace)
{
+ __be32 status;
+
+ /* type */
+ status = nfsd4_encode_acetype4(xdr, ace->type);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* flag */
+ status = nfsd4_encode_aceflag4(xdr, ace->flag);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* access mask */
+ status = nfsd4_encode_acemask4(xdr, ace->access_mask & NFS4_ACE_MASK_ALL);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* who */
if (ace->whotype != NFS4_ACL_WHO_NAMED)
return nfs4_acl_write_who(xdr, ace->whotype);
- else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+ if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
return nfsd4_encode_group(xdr, rqstp, ace->who_gid);
- else
- return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
-}
-
-static inline __be32
-nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types)
-{
- __be32 *p;
- unsigned long i = hweight_long(layout_types);
-
- p = xdr_reserve_space(xdr, 4 + 4 * i);
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(i);
-
- for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
- if (layout_types & (1 << i))
- *p++ = cpu_to_be32(i);
-
- return 0;
+ return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
}
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -2906,12 +2866,12 @@ static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino)
}
static __be32
-nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
+nfsd4_encode_bitmap4(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
{
__be32 *p;
if (bmval2) {
- p = xdr_reserve_space(xdr, 16);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 4);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(3);
@@ -2919,94 +2879,684 @@ nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
*p++ = cpu_to_be32(bmval1);
*p++ = cpu_to_be32(bmval2);
} else if (bmval1) {
- p = xdr_reserve_space(xdr, 12);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 3);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(2);
*p++ = cpu_to_be32(bmval0);
*p++ = cpu_to_be32(bmval1);
} else {
- p = xdr_reserve_space(xdr, 8);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 2);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(1);
*p++ = cpu_to_be32(bmval0);
}
- return 0;
+ return nfs_ok;
out_resource:
return nfserr_resource;
}
+struct nfsd4_fattr_args {
+ struct svc_rqst *rqstp;
+ struct svc_fh *fhp;
+ struct svc_export *exp;
+ struct dentry *dentry;
+ struct kstat stat;
+ struct kstatfs statfs;
+ struct nfs4_acl *acl;
+ u64 size;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ void *context;
+ int contextlen;
+#endif
+ u32 rdattr_err;
+ bool contextsupport;
+ bool ignore_crossmnt;
+};
+
+typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args);
+
+static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4__true(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_bool(xdr, true);
+}
+
+static __be32 nfsd4_encode_fattr4__false(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_bool(xdr, false);
+}
+
+static __be32 nfsd4_encode_fattr4_supported_attrs(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd4_compoundres *resp = args->rqstp->rq_resp;
+ u32 minorversion = resp->cstate.minorversion;
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
+ if (!IS_POSIXACL(d_inode(args->dentry)))
+ supp[0] &= ~FATTR4_WORD0_ACL;
+ if (!args->contextsupport)
+ supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
+}
+
+static __be32 nfsd4_encode_fattr4_type(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!p)
+ return nfserr_resource;
+
+ switch (args->stat.mode & S_IFMT) {
+ case S_IFIFO:
+ *p = cpu_to_be32(NF4FIFO);
+ break;
+ case S_IFCHR:
+ *p = cpu_to_be32(NF4CHR);
+ break;
+ case S_IFDIR:
+ *p = cpu_to_be32(NF4DIR);
+ break;
+ case S_IFBLK:
+ *p = cpu_to_be32(NF4BLK);
+ break;
+ case S_IFLNK:
+ *p = cpu_to_be32(NF4LNK);
+ break;
+ case S_IFREG:
+ *p = cpu_to_be32(NF4REG);
+ break;
+ case S_IFSOCK:
+ *p = cpu_to_be32(NF4SOCK);
+ break;
+ default:
+ return nfserr_serverfault;
+ }
+
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_fh_expire_type(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u32 mask;
+
+ mask = NFS4_FH_PERSISTENT;
+ if (!(args->exp->ex_flags & NFSEXP_NOSUBTREECHECK))
+ mask |= NFS4_FH_VOL_RENAME;
+ return nfsd4_encode_uint32_t(xdr, mask);
+}
+
+static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ const struct svc_export *exp = args->exp;
+ u64 c;
+
+ if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
+ u32 flush_time = convert_to_wallclock(exp->cd->flush_time);
+
+ if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT)
+ return nfserr_resource;
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
+ c = nfsd4_change_attribute(&args->stat, d_inode(args->dentry));
+ return nfsd4_encode_changeid4(xdr, c);
+}
+
+static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->size);
+}
+
+static __be32 nfsd4_encode_fattr4_fsid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 2 + XDR_UNIT * 2);
+ if (!p)
+ return nfserr_resource;
+
+ if (unlikely(args->exp->ex_fslocs.migrated)) {
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
+ xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
+ return nfs_ok;
+ }
+ switch (fsid_source(args->fhp)) {
+ case FSIDSOURCE_FSID:
+ p = xdr_encode_hyper(p, (u64)args->exp->ex_fsid);
+ xdr_encode_hyper(p, (u64)0);
+ break;
+ case FSIDSOURCE_DEV:
+ *p++ = xdr_zero;
+ *p++ = cpu_to_be32(MAJOR(args->stat.dev));
+ *p++ = xdr_zero;
+ *p = cpu_to_be32(MINOR(args->stat.dev));
+ break;
+ case FSIDSOURCE_UUID:
+ xdr_encode_opaque_fixed(p, args->exp->ex_uuid, EX_UUID_LEN);
+ break;
+ }
+
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_lease_time(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(args->rqstp), nfsd_net_id);
+
+ return nfsd4_encode_nfs_lease4(xdr, nn->nfsd4_lease);
+}
+
+static __be32 nfsd4_encode_fattr4_rdattr_error(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->rdattr_err);
+}
+
+static __be32 nfsd4_encode_fattr4_aclsupport(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u32 mask;
+
+ mask = 0;
+ if (IS_POSIXACL(d_inode(args->dentry)))
+ mask = ACL4_SUPPORT_ALLOW_ACL | ACL4_SUPPORT_DENY_ACL;
+ return nfsd4_encode_uint32_t(xdr, mask);
+}
+
+static __be32 nfsd4_encode_fattr4_acl(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfs4_acl *acl = args->acl;
+ struct nfs4_ace *ace;
+ __be32 status;
+
+ /* nfsace4<> */
+ if (!acl) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ } else {
+ if (xdr_stream_encode_u32(xdr, acl->naces) != XDR_UNIT)
+ return nfserr_resource;
+ for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
+ status = nfsd4_encode_nfsace4(xdr, args->rqstp, ace);
+ if (status != nfs_ok)
+ return status;
+ }
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle);
+}
+
+static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->stat.ino);
+}
+
+static __be32 nfsd4_encode_fattr4_files_avail(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree);
+}
+
+static __be32 nfsd4_encode_fattr4_files_free(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree);
+}
+
+static __be32 nfsd4_encode_fattr4_files_total(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_files);
+}
+
+static __be32 nfsd4_encode_fattr4_fs_locations(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_fs_locations4(xdr, args->rqstp, args->exp);
+}
+
+static __be32 nfsd4_encode_fattr4_maxfilesize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct super_block *sb = args->exp->ex_path.mnt->mnt_sb;
+
+ return nfsd4_encode_uint64_t(xdr, sb->s_maxbytes);
+}
+
+static __be32 nfsd4_encode_fattr4_maxlink(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, 255);
+}
+
+static __be32 nfsd4_encode_fattr4_maxname(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->statfs.f_namelen);
+}
+
+static __be32 nfsd4_encode_fattr4_maxread(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp));
+}
+
+static __be32 nfsd4_encode_fattr4_maxwrite(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp));
+}
+
+static __be32 nfsd4_encode_fattr4_mode(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_mode4(xdr, args->stat.mode & S_IALLUGO);
+}
+
+static __be32 nfsd4_encode_fattr4_numlinks(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->stat.nlink);
+}
+
+static __be32 nfsd4_encode_fattr4_owner(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_user(xdr, args->rqstp, args->stat.uid);
+}
+
+static __be32 nfsd4_encode_fattr4_owner_group(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_group(xdr, args->rqstp, args->stat.gid);
+}
+
+static __be32 nfsd4_encode_fattr4_rawdev(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_specdata4(xdr, MAJOR(args->stat.rdev),
+ MINOR(args->stat.rdev));
+}
+
+static __be32 nfsd4_encode_fattr4_space_avail(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 avail = (u64)args->statfs.f_bavail * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, avail);
+}
+
+static __be32 nfsd4_encode_fattr4_space_free(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 free = (u64)args->statfs.f_bfree * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, free);
+}
+
+static __be32 nfsd4_encode_fattr4_space_total(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 total = (u64)args->statfs.f_blocks * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, total);
+}
+
+static __be32 nfsd4_encode_fattr4_space_used(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, (u64)args->stat.blocks << 9);
+}
+
+static __be32 nfsd4_encode_fattr4_time_access(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.atime);
+}
+
+static __be32 nfsd4_encode_fattr4_time_create(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.btime);
+}
+
+/*
+ * ctime (in NFSv4, time_metadata) is not writeable, and the client
+ * doesn't really care what resolution could theoretically be stored by
+ * the filesystem.
+ *
+ * The client cares how close together changes can be while still
+ * guaranteeing ctime changes. For most filesystems (which have
+ * timestamps with nanosecond fields) that is limited by the resolution
+ * of the time returned from current_time() (which I'm assuming to be
+ * 1/HZ).
+ */
+static __be32 nfsd4_encode_fattr4_time_delta(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ const struct inode *inode = d_inode(args->dentry);
+ u32 ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
+ struct timespec64 ts = ns_to_timespec64(ns);
+
+ return nfsd4_encode_nfstime4(xdr, &ts);
+}
+
+static __be32 nfsd4_encode_fattr4_time_metadata(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.ctime);
+}
+
+static __be32 nfsd4_encode_fattr4_time_modify(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.mtime);
+}
+
+static __be32 nfsd4_encode_fattr4_mounted_on_fileid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 ino;
+ int err;
+
+ if (!args->ignore_crossmnt &&
+ args->dentry == args->exp->ex_path.mnt->mnt_root) {
+ err = nfsd4_get_mounted_on_ino(args->exp, &ino);
+ if (err)
+ return nfserrno(err);
+ } else
+ ino = args->stat.ino;
+
+ return nfsd4_encode_uint64_t(xdr, ino);
+}
+
+#ifdef CONFIG_NFSD_PNFS
+
+static __be32 nfsd4_encode_fattr4_fs_layout_types(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ unsigned long mask = args->exp->ex_layout_types;
+ int i;
+
+ /* Hamming weight of @mask is the number of layout types to return */
+ if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT)
+ return nfserr_resource;
+ for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+ if (mask & BIT(i)) {
+ /* layouttype4 */
+ if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT)
+ return nfserr_resource;
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_layout_types(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ unsigned long mask = args->exp->ex_layout_types;
+ int i;
+
+ /* Hamming weight of @mask is the number of layout types to return */
+ if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT)
+ return nfserr_resource;
+ for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+ if (mask & BIT(i)) {
+ /* layouttype4 */
+ if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT)
+ return nfserr_resource;
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_layout_blksize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->stat.blksize);
+}
+
+#endif
+
+static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd4_compoundres *resp = args->rqstp->rq_resp;
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[resp->cstate.minorversion], sizeof(supp));
+ supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
+ supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
+ supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
+
+ return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
+}
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_security_label(xdr, args->rqstp,
+ args->context, args->contextlen);
+}
+#endif
+
+static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ int err = xattr_supports_user_prefix(d_inode(args->dentry));
+
+ return nfsd4_encode_bool(xdr, err == 0);
+}
+
+static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
+ [FATTR4_SUPPORTED_ATTRS] = nfsd4_encode_fattr4_supported_attrs,
+ [FATTR4_TYPE] = nfsd4_encode_fattr4_type,
+ [FATTR4_FH_EXPIRE_TYPE] = nfsd4_encode_fattr4_fh_expire_type,
+ [FATTR4_CHANGE] = nfsd4_encode_fattr4_change,
+ [FATTR4_SIZE] = nfsd4_encode_fattr4_size,
+ [FATTR4_LINK_SUPPORT] = nfsd4_encode_fattr4__true,
+ [FATTR4_SYMLINK_SUPPORT] = nfsd4_encode_fattr4__true,
+ [FATTR4_NAMED_ATTR] = nfsd4_encode_fattr4__false,
+ [FATTR4_FSID] = nfsd4_encode_fattr4_fsid,
+ [FATTR4_UNIQUE_HANDLES] = nfsd4_encode_fattr4__true,
+ [FATTR4_LEASE_TIME] = nfsd4_encode_fattr4_lease_time,
+ [FATTR4_RDATTR_ERROR] = nfsd4_encode_fattr4_rdattr_error,
+ [FATTR4_ACL] = nfsd4_encode_fattr4_acl,
+ [FATTR4_ACLSUPPORT] = nfsd4_encode_fattr4_aclsupport,
+ [FATTR4_ARCHIVE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CANSETTIME] = nfsd4_encode_fattr4__true,
+ [FATTR4_CASE_INSENSITIVE] = nfsd4_encode_fattr4__false,
+ [FATTR4_CASE_PRESERVING] = nfsd4_encode_fattr4__true,
+ [FATTR4_CHOWN_RESTRICTED] = nfsd4_encode_fattr4__true,
+ [FATTR4_FILEHANDLE] = nfsd4_encode_fattr4_filehandle,
+ [FATTR4_FILEID] = nfsd4_encode_fattr4_fileid,
+ [FATTR4_FILES_AVAIL] = nfsd4_encode_fattr4_files_avail,
+ [FATTR4_FILES_FREE] = nfsd4_encode_fattr4_files_free,
+ [FATTR4_FILES_TOTAL] = nfsd4_encode_fattr4_files_total,
+ [FATTR4_FS_LOCATIONS] = nfsd4_encode_fattr4_fs_locations,
+ [FATTR4_HIDDEN] = nfsd4_encode_fattr4__noop,
+ [FATTR4_HOMOGENEOUS] = nfsd4_encode_fattr4__true,
+ [FATTR4_MAXFILESIZE] = nfsd4_encode_fattr4_maxfilesize,
+ [FATTR4_MAXLINK] = nfsd4_encode_fattr4_maxlink,
+ [FATTR4_MAXNAME] = nfsd4_encode_fattr4_maxname,
+ [FATTR4_MAXREAD] = nfsd4_encode_fattr4_maxread,
+ [FATTR4_MAXWRITE] = nfsd4_encode_fattr4_maxwrite,
+ [FATTR4_MIMETYPE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MODE] = nfsd4_encode_fattr4_mode,
+ [FATTR4_NO_TRUNC] = nfsd4_encode_fattr4__true,
+ [FATTR4_NUMLINKS] = nfsd4_encode_fattr4_numlinks,
+ [FATTR4_OWNER] = nfsd4_encode_fattr4_owner,
+ [FATTR4_OWNER_GROUP] = nfsd4_encode_fattr4_owner_group,
+ [FATTR4_QUOTA_AVAIL_HARD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_QUOTA_AVAIL_SOFT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_QUOTA_USED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RAWDEV] = nfsd4_encode_fattr4_rawdev,
+ [FATTR4_SPACE_AVAIL] = nfsd4_encode_fattr4_space_avail,
+ [FATTR4_SPACE_FREE] = nfsd4_encode_fattr4_space_free,
+ [FATTR4_SPACE_TOTAL] = nfsd4_encode_fattr4_space_total,
+ [FATTR4_SPACE_USED] = nfsd4_encode_fattr4_space_used,
+ [FATTR4_SYSTEM] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_ACCESS] = nfsd4_encode_fattr4_time_access,
+ [FATTR4_TIME_ACCESS_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_BACKUP] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_CREATE] = nfsd4_encode_fattr4_time_create,
+ [FATTR4_TIME_DELTA] = nfsd4_encode_fattr4_time_delta,
+ [FATTR4_TIME_METADATA] = nfsd4_encode_fattr4_time_metadata,
+ [FATTR4_TIME_MODIFY] = nfsd4_encode_fattr4_time_modify,
+ [FATTR4_TIME_MODIFY_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MOUNTED_ON_FILEID] = nfsd4_encode_fattr4_mounted_on_fileid,
+ [FATTR4_DIR_NOTIF_DELAY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_DIRENT_NOTIF_DELAY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_DACL] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SACL] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CHANGE_POLICY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_FS_STATUS] = nfsd4_encode_fattr4__noop,
+
+#ifdef CONFIG_NFSD_PNFS
+ [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4_fs_layout_types,
+ [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4_layout_types,
+ [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4_layout_blksize,
+ [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop,
+#else
+ [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop,
+#endif
+
+ [FATTR4_FS_LOCATIONS_INFO] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MDSTHRESHOLD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_GET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTEVT_GET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTEVT_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_HOLD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat,
+ [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop,
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4_sec_label,
+#else
+ [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4__noop,
+#endif
+
+ [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop,
+ [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support,
+};
+
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
*/
static __be32
-nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
- struct svc_export *exp,
- struct dentry *dentry, u32 *bmval,
- struct svc_rqst *rqstp, int ignore_crossmnt)
+nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ struct svc_fh *fhp, struct svc_export *exp,
+ struct dentry *dentry, const u32 *bmval,
+ int ignore_crossmnt)
{
- u32 bmval0 = bmval[0];
- u32 bmval1 = bmval[1];
- u32 bmval2 = bmval[2];
- struct kstat stat;
+ struct nfsd4_fattr_args args;
struct svc_fh *tempfh = NULL;
- struct kstatfs statfs;
- __be32 *p, *attrlen_p;
int starting_len = xdr->buf->len;
+ __be32 *attrlen_p, status;
int attrlen_offset;
- u32 dummy;
- u64 dummy64;
- u32 rdattr_err = 0;
- __be32 status;
int err;
- struct nfs4_acl *acl = NULL;
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- void *context = NULL;
- int contextlen;
-#endif
- bool contextsupport = false;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
u32 minorversion = resp->cstate.minorversion;
struct path path = {
.mnt = exp->ex_path.mnt,
.dentry = dentry,
};
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ union {
+ u32 attrmask[3];
+ unsigned long mask[2];
+ } u;
+ unsigned long bit;
+
+ WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1);
+ WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval));
- BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
- BUG_ON(!nfsd_attrs_supported(minorversion, bmval));
+ args.rqstp = rqstp;
+ args.exp = exp;
+ args.dentry = dentry;
+ args.ignore_crossmnt = (ignore_crossmnt != 0);
+ /*
+ * Make a local copy of the attribute bitmap that can be modified.
+ */
+ memset(&u, 0, sizeof(u));
+ u.attrmask[0] = bmval[0];
+ u.attrmask[1] = bmval[1];
+ u.attrmask[2] = bmval[2];
+
+ args.rdattr_err = 0;
if (exp->ex_fslocs.migrated) {
- status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err);
+ status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1],
+ &u.attrmask[2], &args.rdattr_err);
if (status)
goto out;
}
- if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+ args.size = 0;
+ if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
if (status)
goto out;
}
- err = vfs_getattr(&path, &stat,
+ err = vfs_getattr(&path, &args.stat,
STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
- if (!(stat.result_mask & STATX_BTIME))
+ args.size = args.stat.size;
+
+ if (!(args.stat.result_mask & STATX_BTIME))
/* underlying FS does not offer btime so we can't share it */
- bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
- if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
- (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+ (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
- err = vfs_statfs(&path, &statfs);
+ err = vfs_statfs(&path, &args.statfs);
if (err)
goto out_nfserr;
}
- if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
+ if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
+ !fhp) {
tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
status = nfserr_jukebox;
if (!tempfh)
@@ -3015,12 +3565,15 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
status = fh_compose(tempfh, exp, dentry, NULL);
if (status)
goto out;
- fhp = tempfh;
- }
- if (bmval0 & FATTR4_WORD0_ACL) {
- err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
+ args.fhp = tempfh;
+ } else
+ args.fhp = fhp;
+
+ args.acl = NULL;
+ if (u.attrmask[0] & FATTR4_WORD0_ACL) {
+ err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl);
if (err == -EOPNOTSUPP)
- bmval0 &= ~FATTR4_WORD0_ACL;
+ u.attrmask[0] &= ~FATTR4_WORD0_ACL;
else if (err == -EINVAL) {
status = nfserr_attrnotsupp;
goto out;
@@ -3028,452 +3581,53 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
goto out_nfserr;
}
+ args.contextsupport = false;
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
- bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ args.context = NULL;
+ if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+ u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
err = security_inode_getsecctx(d_inode(dentry),
- &context, &contextlen);
+ &args.context, &args.contextlen);
else
err = -EOPNOTSUPP;
- contextsupport = (err == 0);
- if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ args.contextsupport = (err == 0);
+ if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
if (err == -EOPNOTSUPP)
- bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+ u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
else if (err)
goto out_nfserr;
}
}
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
- status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2);
+ /* attrmask */
+ status = nfsd4_encode_bitmap4(xdr, u.attrmask[0],
+ u.attrmask[1], u.attrmask[2]);
if (status)
goto out;
+ /* attr_vals */
attrlen_offset = xdr->buf->len;
attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
if (!attrlen_p)
goto out_resource;
-
- if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
- u32 supp[3];
-
- memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
-
- if (!IS_POSIXACL(dentry->d_inode))
- supp[0] &= ~FATTR4_WORD0_ACL;
- if (!contextsupport)
- supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
- if (!supp[2]) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(supp[0]);
- *p++ = cpu_to_be32(supp[1]);
- } else {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(supp[0]);
- *p++ = cpu_to_be32(supp[1]);
- *p++ = cpu_to_be32(supp[2]);
- }
- }
- if (bmval0 & FATTR4_WORD0_TYPE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- dummy = nfs4_file_type(stat.mode);
- if (dummy == NF4BAD) {
- status = nfserr_serverfault;
+ for_each_set_bit(bit, (const unsigned long *)&u.mask,
+ ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
+ status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
+ if (status != nfs_ok)
goto out;
- }
- *p++ = cpu_to_be32(dummy);
- }
- if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
- *p++ = cpu_to_be32(NFS4_FH_PERSISTENT);
- else
- *p++ = cpu_to_be32(NFS4_FH_PERSISTENT|
- NFS4_FH_VOL_RENAME);
- }
- if (bmval0 & FATTR4_WORD0_CHANGE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = encode_change(p, &stat, d_inode(dentry), exp);
}
- if (bmval0 & FATTR4_WORD0_SIZE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, stat.size);
- }
- if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_NAMED_ATTR) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_FSID) {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- if (exp->ex_fslocs.migrated) {
- p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
- p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
- } else switch(fsid_source(fhp)) {
- case FSIDSOURCE_FSID:
- p = xdr_encode_hyper(p, (u64)exp->ex_fsid);
- p = xdr_encode_hyper(p, (u64)0);
- break;
- case FSIDSOURCE_DEV:
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(MAJOR(stat.dev));
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(MINOR(stat.dev));
- break;
- case FSIDSOURCE_UUID:
- p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
- EX_UUID_LEN);
- break;
- }
- }
- if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(nn->nfsd4_lease);
- }
- if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(rdattr_err);
- }
- if (bmval0 & FATTR4_WORD0_ACL) {
- struct nfs4_ace *ace;
-
- if (acl == NULL) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
-
- *p++ = cpu_to_be32(0);
- goto out_acl;
- }
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(acl->naces);
-
- for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
- p = xdr_reserve_space(xdr, 4*3);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(ace->type);
- *p++ = cpu_to_be32(ace->flag);
- *p++ = cpu_to_be32(ace->access_mask &
- NFS4_ACE_MASK_ALL);
- status = nfsd4_encode_aclname(xdr, rqstp, ace);
- if (status)
- goto out;
- }
- }
-out_acl:
- if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ?
- ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
- }
- if (bmval0 & FATTR4_WORD0_CANSETTIME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_FILEHANDLE) {
- p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
- if (!p)
- goto out_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
- fhp->fh_handle.fh_size);
- }
- if (bmval0 & FATTR4_WORD0_FILEID) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, stat.ino);
- }
- if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
- }
- if (bmval0 & FATTR4_WORD0_FILES_FREE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
- }
- if (bmval0 & FATTR4_WORD0_FILES_TOTAL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_files);
- }
- if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
- status = nfsd4_encode_fs_locations(xdr, rqstp, exp);
- if (status)
- goto out;
- }
- if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes);
- }
- if (bmval0 & FATTR4_WORD0_MAXLINK) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(255);
- }
- if (bmval0 & FATTR4_WORD0_MAXNAME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(statfs.f_namelen);
- }
- if (bmval0 & FATTR4_WORD0_MAXREAD) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
- }
- if (bmval0 & FATTR4_WORD0_MAXWRITE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
- }
- if (bmval1 & FATTR4_WORD1_MODE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.mode & S_IALLUGO);
- }
- if (bmval1 & FATTR4_WORD1_NO_TRUNC) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval1 & FATTR4_WORD1_NUMLINKS) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.nlink);
- }
- if (bmval1 & FATTR4_WORD1_OWNER) {
- status = nfsd4_encode_user(xdr, rqstp, stat.uid);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
- status = nfsd4_encode_group(xdr, rqstp, stat.gid);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_RAWDEV) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32((u32) MAJOR(stat.rdev));
- *p++ = cpu_to_be32((u32) MINOR(stat.rdev));
- }
- if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_FREE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_USED) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)stat.blocks << 9;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
- status = nfsd4_encode_nfstime4(xdr, &stat.atime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
- status = nfsd4_encode_nfstime4(xdr, &stat.btime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- p = encode_time_delta(p, d_inode(dentry));
- }
- if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
- status = nfsd4_encode_nfstime4(xdr, &stat.ctime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
- status = nfsd4_encode_nfstime4(xdr, &stat.mtime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
- u64 ino = stat.ino;
-
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- /*
- * Get ino of mountpoint in parent filesystem, if not ignoring
- * crossmount and this is the root of a cross-mounted
- * filesystem.
- */
- if (ignore_crossmnt == 0 &&
- dentry == exp->ex_path.mnt->mnt_root) {
- err = nfsd4_get_mounted_on_ino(exp, &ino);
- if (err)
- goto out_nfserr;
- }
- p = xdr_encode_hyper(p, ino);
- }
-#ifdef CONFIG_NFSD_PNFS
- if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
- if (status)
- goto out;
- }
-
- if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
- if (status)
- goto out;
- }
-
- if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.blksize);
- }
-#endif /* CONFIG_NFSD_PNFS */
- if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- u32 supp[3];
-
- memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
- supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
- supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
- supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
-
- status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
- if (status)
- goto out;
- }
-
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
- status = nfsd4_encode_security_label(xdr, rqstp, context,
- contextlen);
- if (status)
- goto out;
- }
-#endif
-
- if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- err = xattr_supports_user_prefix(d_inode(dentry));
- *p++ = cpu_to_be32(err == 0);
- }
-
*attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
status = nfs_ok;
out:
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if (context)
- security_release_secctx(context, contextlen);
+ if (args.context)
+ security_release_secctx(args.context, args.contextlen);
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
- kfree(acl);
+ kfree(args.acl);
if (tempfh) {
fh_put(tempfh);
kfree(tempfh);
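
The rewritten nfsd4_encode_fattr4() above drops the long chain of per-attribute "if (bmval & ...)" tests in favour of the nfsd4_enc_fattr4_encode_ops[] table, walked with for_each_set_bit() over the combined attribute mask, so each attribute has its own small encoder. A minimal user-space sketch of the same dispatch pattern follows; the mask layout, bit numbers and encoder names are illustrative, not the kernel's.

	#include <stdint.h>
	#include <stdio.h>

	typedef int (*attr_encoder)(void *ctx);

	static int encode_type(void *ctx) { (void)ctx; puts("type"); return 0; }
	static int encode_size(void *ctx) { (void)ctx; puts("size"); return 0; }
	static int encode_fsid(void *ctx) { (void)ctx; puts("fsid"); return 0; }

	/* Index in this table == bit position in the attribute mask. */
	static const attr_encoder encoders[] = {
		[1] = encode_type,
		[4] = encode_size,
		[8] = encode_fsid,
	};

	static int encode_attrs(uint64_t mask, void *ctx)
	{
		for (unsigned int bit = 0; bit < sizeof(encoders) / sizeof(encoders[0]); bit++) {
			if (!(mask & (1ULL << bit)) || !encoders[bit])
				continue;
			int err = encoders[bit](ctx);
			if (err)
				return err;
		}
		return 0;
	}

	int main(void)
	{
		return encode_attrs((1ULL << 1) | (1ULL << 4) | (1ULL << 8), NULL);
	}
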
@@ -3514,12 +3668,28 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
__be32 ret;
svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2);
- ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp,
- ignore_crossmnt);
+ ret = nfsd4_encode_fattr4(rqstp, &xdr, fhp, exp, dentry, bmval,
+ ignore_crossmnt);
*p = xdr.p;
return ret;
}
+/*
+ * The buffer space for this field was reserved during a previous
+ * call to nfsd4_encode_entry4().
+ */
+static void nfsd4_encode_entry4_nfs_cookie4(const struct nfsd4_readdir *readdir,
+ u64 offset)
+{
+ __be64 cookie = cpu_to_be64(offset);
+ struct xdr_stream *xdr = readdir->xdr;
+
+ if (!readdir->cookie_offset)
+ return;
+ write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, &cookie,
+ sizeof(cookie));
+}
+
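
nfsd4_encode_entry4_nfs_cookie4() above relies on a reserve-then-backfill idiom: nfsd4_encode_entry4() reserves space for each entry's cookie while encoding it, records the byte offset in cookie_offset, and the real value is written only once the next entry (or the end of the listing) supplies the directory offset to advertise. A self-contained sketch of the idiom against a plain byte buffer; the kernel does the equivalent through xdr_reserve_space() and write_bytes_to_xdr_buf().

	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	struct buf {
		unsigned char data[256];
		size_t len;
	};

	/* Reserve 8 bytes for a big-endian u64 and return the offset for later backfill. */
	static size_t reserve_cookie(struct buf *b)
	{
		size_t off = b->len;

		memset(b->data + off, 0, 8);
		b->len += 8;
		return off;
	}

	static void backfill_cookie(struct buf *b, size_t off, uint64_t cookie)
	{
		for (int i = 0; i < 8; i++)
			b->data[off + i] = (unsigned char)(cookie >> (56 - 8 * i));
	}

	int main(void)
	{
		struct buf b = { .len = 0 };
		size_t slot = reserve_cookie(&b);  /* entry N: cookie not known yet */

		b.len += 16;                       /* ...entry N's name and attributes... */
		backfill_cookie(&b, slot, 42);     /* next entry's offset becomes N's cookie */
		printf("buffer length: %zu\n", b.len);
		return 0;
	}
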
static inline int attributes_need_mount(u32 *bmval)
{
if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME))
@@ -3530,8 +3700,8 @@ static inline int attributes_need_mount(u32 *bmval)
}
static __be32
-nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
- const char *name, int namlen)
+nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name,
+ int namlen)
{
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
@@ -3574,33 +3744,34 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
}
out_encode:
- nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval,
- cd->rd_rqstp, ignore_crossmnt);
+ nfserr = nfsd4_encode_fattr4(cd->rd_rqstp, cd->xdr, NULL, exp, dentry,
+ cd->rd_bmval, ignore_crossmnt);
out_put:
dput(dentry);
exp_put(exp);
return nfserr;
}
-static __be32 *
-nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
+static __be32
+nfsd4_encode_entry4_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return NULL;
- *p++ = htonl(2);
- *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
- *p++ = htonl(0); /* bmval1 */
+ __be32 status;
- *p++ = htonl(4); /* attribute length */
- *p++ = nfserr; /* no htonl */
- return p;
+ /* attrmask */
+ status = nfsd4_encode_bitmap4(xdr, FATTR4_WORD0_RDATTR_ERROR, 0, 0);
+ if (status != nfs_ok)
+ return status;
+ /* attr_vals */
+ if (xdr_stream_encode_u32(xdr, XDR_UNIT) != XDR_UNIT)
+ return nfserr_resource;
+ /* rdattr_error */
+ if (xdr_stream_encode_be32(xdr, nfserr) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
}
static int
-nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
+nfsd4_encode_entry4(void *ccdv, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct readdir_cd *ccd = ccdv;
@@ -3611,8 +3782,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
u32 name_and_cookie;
int entry_bytes;
__be32 nfserr = nfserr_toosmall;
- __be64 wire_offset;
- __be32 *p;
/* In nfsv4, "." and ".." never make it onto the wire.. */
if (name && isdotent(name, namlen)) {
@@ -3620,24 +3789,19 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
return 0;
}
- if (cd->cookie_offset) {
- wire_offset = cpu_to_be64(offset);
- write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset,
- &wire_offset, 8);
- }
+ /* Encode the previous entry's cookie value */
+ nfsd4_encode_entry4_nfs_cookie4(cd, offset);
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ if (xdr_stream_encode_item_present(xdr) != XDR_UNIT)
goto fail;
- *p++ = xdr_one; /* mark entry present */
+
+ /* Reserve send buffer space for this entry's cookie value. */
cookie_offset = xdr->buf->len;
- p = xdr_reserve_space(xdr, 3*4 + namlen);
- if (!p)
+ if (nfsd4_encode_nfs_cookie4(xdr, OFFSET_MAX) != nfs_ok)
goto fail;
- p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */
- p = xdr_encode_array(p, name, namlen); /* name length & name */
-
- nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
+ if (nfsd4_encode_component4(xdr, name, namlen) != nfs_ok)
+ goto fail;
+ nfserr = nfsd4_encode_entry4_fattr(cd, name, namlen);
switch (nfserr) {
case nfs_ok:
break;
@@ -3668,8 +3832,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
*/
if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
goto fail;
- p = nfsd4_encode_rdattr_error(xdr, nfserr);
- if (p == NULL) {
+ if (nfsd4_encode_entry4_rdattr_error(xdr, nfserr)) {
nfserr = nfserr_toosmall;
goto fail;
}
@@ -3727,18 +3890,26 @@ nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid)
return nfs_ok;
}
+/* This is a frequently-encoded item; open-coded for speed */
static __be32
-nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+nfsd4_encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
{
__be32 *p;
- p = xdr_reserve_space(xdr, sizeof(stateid_t));
+ p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
if (!p)
return nfserr_resource;
*p++ = cpu_to_be32(sid->si_generation);
- p = xdr_encode_opaque_fixed(p, &sid->si_opaque,
- sizeof(stateid_opaque_t));
- return 0;
+ memcpy(p, &sid->si_opaque, sizeof(sid->si_opaque));
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_sessionid4(struct xdr_stream *xdr,
+ const struct nfs4_sessionid *sessionid)
+{
+ return nfsd4_encode_opaque_fixed(xdr, sessionid->data,
+ NFS4_MAX_SESSIONID_LEN);
}
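
nfsd4_encode_stateid4() above is deliberately open-coded: a stateid4 is a fixed 16-byte item (NFS4_STATEID_SIZE), a 4-byte big-endian si_generation followed by the 12 opaque bytes of si_opaque, so a single reservation plus a memcpy suffices and no length word or padding is involved. A small stand-alone sketch of that fixed layout; the struct and macro names here are illustrative.

	#include <stdint.h>
	#include <string.h>

	#define STATEID_OPAQUE_LEN 12   /* mirrors stateid_opaque_t; illustrative name */

	struct stateid {
		uint32_t generation;
		unsigned char opaque[STATEID_OPAQUE_LEN];
	};

	/* stateid4 on the wire: 4-byte big-endian generation + 12 opaque bytes = 16 bytes. */
	static size_t encode_stateid(unsigned char *out, const struct stateid *sid)
	{
		out[0] = (unsigned char)(sid->generation >> 24);
		out[1] = (unsigned char)(sid->generation >> 16);
		out[2] = (unsigned char)(sid->generation >> 8);
		out[3] = (unsigned char)sid->generation;
		memcpy(out + 4, sid->opaque, STATEID_OPAQUE_LEN);
		return 4 + STATEID_OPAQUE_LEN;
	}

	int main(void)
	{
		struct stateid sid = { .generation = 7 };
		unsigned char wire[16];

		return encode_stateid(wire, &sid) == 16 ? 0 : 1;
	}
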
static __be32
@@ -3747,14 +3918,14 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_access *access = &u->access;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(access->ac_supported);
- *p++ = cpu_to_be32(access->ac_resp_access);
- return 0;
+ /* supported */
+ status = nfsd4_encode_uint32_t(xdr, access->ac_supported);
+ if (status != nfs_ok)
+ return status;
+ /* access */
+ return nfsd4_encode_uint32_t(xdr, access->ac_resp_access);
}
static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr,
@@ -3762,17 +3933,16 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
{
struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
- if (!p)
+ /* bctsr_sessid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &bcts->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* bctsr_dir */
+ if (xdr_stream_encode_u32(xdr, bcts->dir) != XDR_UNIT)
return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(bcts->dir);
- /* Upshifting from TCP to RDMA is not supported */
- *p++ = cpu_to_be32(0);
- return 0;
+ /* bctsr_use_conn_in_rdma_mode */
+ return nfsd4_encode_bool(xdr, false);
}
static __be32
@@ -3782,7 +3952,8 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_close *close = &u->close;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &close->cl_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &close->cl_stateid);
}
@@ -3802,11 +3973,13 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_create *create = &u->create;
struct xdr_stream *xdr = resp->xdr;
+ /* cinfo */
nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo);
if (nfserr)
return nfserr;
- return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
- create->cr_bmval[1], create->cr_bmval[2]);
+ /* attrset */
+ return nfsd4_encode_bitmap4(xdr, create->cr_bmval[0],
+ create->cr_bmval[1], create->cr_bmval[2]);
}
static __be32
@@ -3817,65 +3990,56 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr,
struct svc_fh *fhp = getattr->ga_fhp;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
- getattr->ga_bmval, resp->rqstp, 0);
+ /* obj_attributes */
+ return nfsd4_encode_fattr4(resp->rqstp, xdr, fhp, fhp->fh_export,
+ fhp->fh_dentry, getattr->ga_bmval, 0);
}
static __be32
nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
- struct svc_fh **fhpp = &u->getfh;
struct xdr_stream *xdr = resp->xdr;
- struct svc_fh *fhp = *fhpp;
- unsigned int len;
- __be32 *p;
+ struct svc_fh *fhp = u->getfh;
- len = fhp->fh_handle.fh_size;
- p = xdr_reserve_space(xdr, len + 4);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
- return 0;
+ /* object */
+ return nfsd4_encode_nfs_fh4(xdr, &fhp->fh_handle);
}
-/*
-* Including all fields other than the name, a LOCK4denied structure requires
-* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes.
-*/
static __be32
-nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
+nfsd4_encode_lock_owner4(struct xdr_stream *xdr, const clientid_t *clientid,
+ const struct xdr_netobj *owner)
{
- struct xdr_netobj *conf = &ld->ld_owner;
- __be32 *p;
+ __be32 status;
-again:
- p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len));
- if (!p) {
- /*
- * Don't fail to return the result just because we can't
- * return the conflicting open:
- */
- if (conf->len) {
- kfree(conf->data);
- conf->len = 0;
- conf->data = NULL;
- goto again;
- }
+ /* clientid */
+ status = nfsd4_encode_clientid4(xdr, clientid);
+ if (status != nfs_ok)
+ return status;
+ /* owner */
+ return nfsd4_encode_opaque(xdr, owner->data, owner->len);
+}
+
+static __be32
+nfsd4_encode_lock4denied(struct xdr_stream *xdr,
+ const struct nfsd4_lock_denied *ld)
+{
+ __be32 status;
+
+ /* offset */
+ status = nfsd4_encode_offset4(xdr, ld->ld_start);
+ if (status != nfs_ok)
+ return status;
+ /* length */
+ status = nfsd4_encode_length4(xdr, ld->ld_length);
+ if (status != nfs_ok)
+ return status;
+ /* locktype */
+ if (xdr_stream_encode_u32(xdr, ld->ld_type) != XDR_UNIT)
return nfserr_resource;
- }
- p = xdr_encode_hyper(p, ld->ld_start);
- p = xdr_encode_hyper(p, ld->ld_length);
- *p++ = cpu_to_be32(ld->ld_type);
- if (conf->len) {
- p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8);
- p = xdr_encode_opaque(p, conf->data, conf->len);
- kfree(conf->data);
- } else { /* non - nfsv4 lock in conflict, no clientid nor owner */
- p = xdr_encode_hyper(p, (u64)0); /* clientid */
- *p++ = cpu_to_be32(0); /* length of owner name */
- }
- return nfserr_denied;
+ /* owner */
+ return nfsd4_encode_lock_owner4(xdr, &ld->ld_clientid,
+ &ld->ld_owner);
}
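
The owner field of the lock_owner4 encoded above goes out through nfsd4_encode_opaque(), i.e. as an XDR variable-length opaque: a 4-byte big-endian length, the bytes themselves, then zero padding up to the next 4-byte boundary. A compact sketch of that wire rule:

	#include <stddef.h>
	#include <string.h>

	/* XDR opaque<>: 4-byte big-endian length, the data, zero padding to a 4-byte boundary. */
	static size_t xdr_encode_opaque(unsigned char *out, const void *data, size_t len)
	{
		size_t padded = (len + 3) & ~(size_t)3;

		out[0] = (unsigned char)(len >> 24);
		out[1] = (unsigned char)(len >> 16);
		out[2] = (unsigned char)(len >> 8);
		out[3] = (unsigned char)len;
		memcpy(out + 4, data, len);
		memset(out + 4 + len, 0, padded - len);
		return 4 + padded;
	}

	int main(void)
	{
		unsigned char wire[32];

		/* "owner" is 5 bytes, so 3 padding bytes follow: 4 + 8 = 12 bytes total. */
		return xdr_encode_opaque(wire, "owner", 5) == 12 ? 0 : 1;
	}
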
static __be32
@@ -3884,13 +4048,21 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_lock *lock = &u->lock;
struct xdr_stream *xdr = resp->xdr;
+ __be32 status;
- if (!nfserr)
- nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
- else if (nfserr == nfserr_denied)
- nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied);
-
- return nfserr;
+ switch (nfserr) {
+ case nfs_ok:
+ /* resok4 */
+ status = nfsd4_encode_stateid4(xdr, &lock->lk_resp_stateid);
+ break;
+ case nfserr_denied:
+ /* denied */
+ status = nfsd4_encode_lock4denied(xdr, &lock->lk_denied);
+ break;
+ default:
+ return nfserr;
+ }
+ return status != nfs_ok ? status : nfserr;
}
static __be32
@@ -3899,9 +4071,14 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_lockt *lockt = &u->lockt;
struct xdr_stream *xdr = resp->xdr;
+ __be32 status;
- if (nfserr == nfserr_denied)
- nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
+ if (nfserr == nfserr_denied) {
+ /* denied */
+ status = nfsd4_encode_lock4denied(xdr, &lockt->lt_denied);
+ if (status != nfs_ok)
+ return status;
+ }
return nfserr;
}
@@ -3912,7 +4089,8 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_locku *locku = &u->locku;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
+ /* lock_stateid */
+ return nfsd4_encode_stateid4(xdr, &locku->lu_stateid);
}
@@ -3926,104 +4104,159 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfsd4_encode_change_info4(xdr, &link->li_cinfo);
}
+/*
+ * This implementation does not yet support returning an ACE in an
+ * OPEN that offers a delegation.
+ */
+static __be32
+nfsd4_encode_open_nfsace4(struct xdr_stream *xdr)
+{
+ __be32 status;
+
+ /* type */
+ status = nfsd4_encode_acetype4(xdr, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* flag */
+ status = nfsd4_encode_aceflag4(xdr, 0);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* access mask */
+ status = nfsd4_encode_acemask4(xdr, 0);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* who - empty for now */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+}
static __be32
-nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_open_read_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open)
{
- struct nfsd4_open *open = &u->open;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
- if (nfserr)
- return nfserr;
- nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
- if (nfserr)
- return nfserr;
- if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0)
+ /* stateid */
+ status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid);
+ if (status != nfs_ok)
+ return status;
+ /* recall */
+ status = nfsd4_encode_bool(xdr, open->op_recall);
+ if (status != nfs_ok)
+ return status;
+ /* permissions */
+ return nfsd4_encode_open_nfsace4(xdr);
+}
+
+static __be32
+nfsd4_encode_nfs_space_limit4(struct xdr_stream *xdr, u64 filesize)
+{
+ /* limitby */
+ if (xdr_stream_encode_u32(xdr, NFS4_LIMIT_SIZE) != XDR_UNIT)
return nfserr_resource;
+ /* filesize */
+ return nfsd4_encode_uint64_t(xdr, filesize);
+}
- nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
- open->op_bmval[2]);
- if (nfserr)
- return nfserr;
+static __be32
+nfsd4_encode_open_write_delegation4(struct xdr_stream *xdr,
+ struct nfsd4_open *open)
+{
+ __be32 status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ /* stateid */
+ status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid);
+ if (status != nfs_ok)
+ return status;
+ /* recall */
+ status = nfsd4_encode_bool(xdr, open->op_recall);
+ if (status != nfs_ok)
+ return status;
+ /* space_limit */
+ status = nfsd4_encode_nfs_space_limit4(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_open_nfsace4(xdr);
+}
+
+static __be32
+nfsd4_encode_open_none_delegation4(struct xdr_stream *xdr,
+ struct nfsd4_open *open)
+{
+ __be32 status = nfs_ok;
+
+ /* ond_why */
+ if (xdr_stream_encode_u32(xdr, open->op_why_no_deleg) != XDR_UNIT)
return nfserr_resource;
+ switch (open->op_why_no_deleg) {
+ case WND4_CONTENTION:
+ /* ond_server_will_push_deleg */
+ status = nfsd4_encode_bool(xdr, false);
+ break;
+ case WND4_RESOURCE:
+ /* ond_server_will_signal_avail */
+ status = nfsd4_encode_bool(xdr, false);
+ }
+ return status;
+}
+
+static __be32
+nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open)
+{
+ __be32 status;
- *p++ = cpu_to_be32(open->op_delegate_type);
+ /* delegation_type */
+ if (xdr_stream_encode_u32(xdr, open->op_delegate_type) != XDR_UNIT)
+ return nfserr_resource;
switch (open->op_delegate_type) {
case NFS4_OPEN_DELEGATE_NONE:
+ status = nfs_ok;
break;
case NFS4_OPEN_DELEGATE_READ:
- nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
- if (nfserr)
- return nfserr;
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_recall);
-
- /*
- * TODO: ACE's in delegations
- */
- *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
+ /* read */
+ status = nfsd4_encode_open_read_delegation4(xdr, open);
break;
case NFS4_OPEN_DELEGATE_WRITE:
- nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
- if (nfserr)
- return nfserr;
-
- p = xdr_reserve_space(xdr, XDR_UNIT * 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_recall);
-
- /*
- * Always flush on close
- *
- * TODO: space_limit's in delegations
- */
- *p++ = cpu_to_be32(NFS4_LIMIT_SIZE);
- *p++ = xdr_zero;
- *p++ = xdr_zero;
-
- /*
- * TODO: ACE's in delegations
- */
- *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
+ /* write */
+ status = nfsd4_encode_open_write_delegation4(xdr, open);
break;
- case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
- switch (open->op_why_no_deleg) {
- case WND4_CONTENTION:
- case WND4_RESOURCE:
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_why_no_deleg);
- /* deleg signaling not supported yet: */
- *p++ = cpu_to_be32(0);
- break;
- default:
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_why_no_deleg);
- }
+ case NFS4_OPEN_DELEGATE_NONE_EXT:
+ /* od_whynone */
+ status = nfsd4_encode_open_none_delegation4(xdr, open);
break;
default:
- BUG();
+ status = nfserr_serverfault;
}
- /* XXX save filehandle here */
- return 0;
+
+ return status;
+}
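
nfsd4_encode_open_delegation4() above encodes an XDR discriminated union: the delegation_type discriminant goes on the wire first, then exactly one arm selected by the switch, and an unrecognised discriminant now yields nfserr_serverfault instead of the old BUG(). The same shape in a compact user-space sketch with made-up arm encoders:

	#include <stdint.h>
	#include <stdio.h>

	enum deleg_type { DELEG_NONE = 0, DELEG_READ = 1, DELEG_WRITE = 2 };

	static int encode_u32(uint32_t v)  { printf("u32 %u\n", v); return 0; }
	static int encode_read_arm(void)   { puts("read delegation body");  return 0; }
	static int encode_write_arm(void)  { puts("write delegation body"); return 0; }

	/* Discriminant first, then exactly one arm; unknown values are an error. */
	static int encode_delegation(enum deleg_type type)
	{
		int err = encode_u32(type);

		if (err)
			return err;
		switch (type) {
		case DELEG_NONE:
			return 0;              /* void arm */
		case DELEG_READ:
			return encode_read_arm();
		case DELEG_WRITE:
			return encode_write_arm();
		default:
			return -1;             /* analogous to nfserr_serverfault */
		}
	}

	int main(void)
	{
		return encode_delegation(DELEG_READ);
	}
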
+
+static __be32
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_open *open = &u->open;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &open->op_stateid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* cinfo */
+ nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* rflags */
+ nfserr = nfsd4_encode_uint32_t(xdr, open->op_rflags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* attrset */
+ nfserr = nfsd4_encode_bitmap4(xdr, open->op_bmval[0],
+ open->op_bmval[1], open->op_bmval[2]);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* delegation */
+ return nfsd4_encode_open_delegation4(xdr, open);
}
static __be32
@@ -4033,7 +4266,8 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_open_confirm *oc = &u->open_confirm;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &oc->oc_resp_stateid);
}
static __be32
@@ -4043,7 +4277,8 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_open_downgrade *od = &u->open_downgrade;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &od->od_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &od->od_stateid);
}
/*
@@ -4227,90 +4462,83 @@ out_err:
return nfserr;
}
-static __be32
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+static __be32 nfsd4_encode_dirlist4(struct xdr_stream *xdr,
+ struct nfsd4_readdir *readdir,
+ u32 max_payload)
{
- struct nfsd4_readdir *readdir = &u->readdir;
- int maxcount;
- int bytes_left;
+ int bytes_left, maxcount, starting_len = xdr->buf->len;
loff_t offset;
- __be64 wire_offset;
- struct xdr_stream *xdr = resp->xdr;
- int starting_len = xdr->buf->len;
- __be32 *p;
-
- nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
- if (nfserr != nfs_ok)
- return nfserr;
+ __be32 status;
/*
* Number of bytes left for directory entries allowing for the
- * final 8 bytes of the readdir and a following failed op:
+ * final 8 bytes of the readdir and a following failed op.
*/
- bytes_left = xdr->buf->buflen - xdr->buf->len
- - COMPOUND_ERR_SLACK_SPACE - 8;
- if (bytes_left < 0) {
- nfserr = nfserr_resource;
- goto err_no_verf;
- }
- maxcount = svc_max_payload(resp->rqstp);
- maxcount = min_t(u32, readdir->rd_maxcount, maxcount);
+ bytes_left = xdr->buf->buflen - xdr->buf->len -
+ COMPOUND_ERR_SLACK_SPACE - XDR_UNIT * 2;
+ if (bytes_left < 0)
+ return nfserr_resource;
+ maxcount = min_t(u32, readdir->rd_maxcount, max_payload);
+
/*
- * Note the rfc defines rd_maxcount as the size of the
- * READDIR4resok structure, which includes the verifier above
- * and the 8 bytes encoded at the end of this function:
+ * The RFC defines rd_maxcount as the size of the
+ * READDIR4resok structure, which includes the verifier
+ * and the 8 bytes encoded at the end of this function.
*/
- if (maxcount < 16) {
- nfserr = nfserr_toosmall;
- goto err_no_verf;
- }
- maxcount = min_t(int, maxcount-16, bytes_left);
+ if (maxcount < XDR_UNIT * 4)
+ return nfserr_toosmall;
+ maxcount = min_t(int, maxcount - XDR_UNIT * 4, bytes_left);
- /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+ /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0 */
if (!readdir->rd_dircount)
- readdir->rd_dircount = svc_max_payload(resp->rqstp);
+ readdir->rd_dircount = max_payload;
+ /* *entries */
readdir->xdr = xdr;
readdir->rd_maxcount = maxcount;
readdir->common.err = 0;
readdir->cookie_offset = 0;
-
offset = readdir->rd_cookie;
- nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
- &offset,
- &readdir->common, nfsd4_encode_dirent);
- if (nfserr == nfs_ok &&
- readdir->common.err == nfserr_toosmall &&
- xdr->buf->len == starting_len + 8) {
- /* nothing encoded; which limit did we hit?: */
- if (maxcount - 16 < bytes_left)
- /* It was the fault of rd_maxcount: */
- nfserr = nfserr_toosmall;
- else
- /* We ran out of buffer space: */
- nfserr = nfserr_resource;
+ status = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, &offset,
+ &readdir->common, nfsd4_encode_entry4);
+ if (status)
+ return status;
+ if (readdir->common.err == nfserr_toosmall &&
+ xdr->buf->len == starting_len) {
+ /* No entries were encoded. Which limit did we hit? */
+ if (maxcount - XDR_UNIT * 4 < bytes_left)
+ /* It was the fault of rd_maxcount */
+ return nfserr_toosmall;
+ /* We ran out of buffer space */
+ return nfserr_resource;
}
- if (nfserr)
- goto err_no_verf;
+ /* Encode the final entry's cookie value */
+ nfsd4_encode_entry4_nfs_cookie4(readdir, offset);
+ /* No entries follow */
+ if (xdr_stream_encode_item_absent(xdr) != XDR_UNIT)
+ return nfserr_resource;
- if (readdir->cookie_offset) {
- wire_offset = cpu_to_be64(offset);
- write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset,
- &wire_offset, 8);
- }
+ /* eof */
+ return nfsd4_encode_bool(xdr, readdir->common.err == nfserr_eof);
+}
- p = xdr_reserve_space(xdr, 8);
- if (!p) {
- WARN_ON_ONCE(1);
- goto err_no_verf;
- }
- *p++ = 0; /* no more entries */
- *p++ = htonl(readdir->common.err == nfserr_eof);
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_readdir *readdir = &u->readdir;
+ struct xdr_stream *xdr = resp->xdr;
+ int starting_len = xdr->buf->len;
- return 0;
-err_no_verf:
- xdr_truncate_encode(xdr, starting_len);
+ /* cookieverf */
+ nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
+ if (nfserr != nfs_ok)
+ return nfserr;
+
+ /* reply */
+ nfserr = nfsd4_encode_dirlist4(xdr, readdir, svc_max_payload(resp->rqstp));
+ if (nfserr != nfs_ok)
+ xdr_truncate_encode(xdr, starting_len);
return nfserr;
}
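
nfsd4_encode_dirlist4() above computes two independent budgets before walking the directory: the space actually left in the send buffer (minus COMPOUND_ERR_SLACK_SPACE and the 8 bytes for the trailing entry-absent word and eof boolean) and the client's rd_maxcount capped at the transport payload (minus 16 bytes covering the 8-byte verifier plus that same trailer). The smaller of the two becomes the entry budget, which is also what lets the error path tell nfserr_toosmall apart from nfserr_resource when nothing fits. A tiny sketch of the arithmetic; the constants are stand-ins:

	#include <stdio.h>

	#define SLACK     16   /* stand-in for COMPOUND_ERR_SLACK_SPACE */
	#define XDR_UNIT   4

	/* Returns the byte budget for directory entries, or a negative error. */
	static int entry_budget(int buflen, int buf_used, unsigned int rd_maxcount,
				unsigned int max_payload)
	{
		int bytes_left = buflen - buf_used - SLACK - XDR_UNIT * 2;
		int maxcount;

		if (bytes_left < 0)
			return -1;                       /* like nfserr_resource */
		maxcount = rd_maxcount < max_payload ? rd_maxcount : max_payload;
		if (maxcount < XDR_UNIT * 4)
			return -2;                       /* like nfserr_toosmall */
		maxcount -= XDR_UNIT * 4;                /* verifier + trailer already spoken for */
		return maxcount < bytes_left ? maxcount : bytes_left;
	}

	int main(void)
	{
		printf("budget: %d\n", entry_budget(4096, 128, 1024, 1048576));
		return 0;
	}
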
@@ -4338,13 +4566,34 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
+nfsd4_encode_rpcsec_gss_info(struct xdr_stream *xdr,
+ struct rpcsec_gss_info *info)
+{
+ __be32 status;
+
+ /* oid */
+ if (xdr_stream_encode_opaque(xdr, info->oid.data, info->oid.len) < 0)
+ return nfserr_resource;
+ /* qop */
+ status = nfsd4_encode_qop4(xdr, info->qop);
+ if (status != nfs_ok)
+ return status;
+ /* service */
+ if (xdr_stream_encode_u32(xdr, info->service) != XDR_UNIT)
+ return nfserr_resource;
+
+ return nfs_ok;
+}
+
+static __be32
nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
{
u32 i, nflavs, supported;
struct exp_flavor_info *flavs;
struct exp_flavor_info def_flavs[2];
- __be32 *p, *flavorsp;
static bool report = true;
+ __be32 *flavorsp;
+ __be32 status;
if (exp->ex_nflavors) {
flavs = exp->ex_flavors;
@@ -4367,10 +4616,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
}
supported = 0;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ flavorsp = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!flavorsp)
return nfserr_resource;
- flavorsp = p++; /* to be backfilled later */
for (i = 0; i < nflavs; i++) {
rpc_authflavor_t pf = flavs[i].pseudoflavor;
@@ -4378,20 +4626,22 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
if (rpcauth_get_gssinfo(pf, &info) == 0) {
supported++;
- p = xdr_reserve_space(xdr, 4 + 4 +
- XDR_LEN(info.oid.len) + 4 + 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(RPC_AUTH_GSS);
- p = xdr_encode_opaque(p, info.oid.data, info.oid.len);
- *p++ = cpu_to_be32(info.qop);
- *p++ = cpu_to_be32(info.service);
+
+ /* flavor */
+ status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS);
+ if (status != nfs_ok)
+ return status;
+ /* flavor_info */
+ status = nfsd4_encode_rpcsec_gss_info(xdr, &info);
+ if (status != nfs_ok)
+ return status;
} else if (pf < RPC_AUTH_MAXFLAVOR) {
supported++;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(pf);
+
+ /* flavor */
+ status = nfsd4_encode_uint32_t(xdr, pf);
+ if (status != nfs_ok)
+ return status;
} else {
if (report)
pr_warn("NFS: SECINFO: security flavor %u "
@@ -4401,7 +4651,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
if (nflavs != supported)
report = false;
- *flavorsp = htonl(supported);
+ *flavorsp = cpu_to_be32(supported);
return 0;
}
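
nfsd4_do_encode_secinfo() above emits a counted array whose length is not known until the loop finishes: it reserves one XDR_UNIT for the count, streams only the flavors it can actually describe, and backfills the reserved word with the final supported value. A minimal stand-alone version of that count-backfill pattern:

	#include <stdint.h>
	#include <stdio.h>

	static unsigned char buf[64];
	static size_t buflen;

	static void put_be32(size_t off, uint32_t v)
	{
		buf[off + 0] = (unsigned char)(v >> 24);
		buf[off + 1] = (unsigned char)(v >> 16);
		buf[off + 2] = (unsigned char)(v >> 8);
		buf[off + 3] = (unsigned char)v;
	}

	int main(void)
	{
		const uint32_t flavors[] = { 1, 390003, 999999 };  /* made-up flavor numbers */
		size_t countp = buflen;                            /* reserve the count word */
		uint32_t supported = 0;

		buflen += 4;
		for (size_t i = 0; i < 3; i++) {
			if (flavors[i] > 390005)                   /* pretend this one is unknown */
				continue;
			put_be32(buflen, flavors[i]);
			buflen += 4;
			supported++;
		}
		put_be32(countp, supported);                       /* backfill the real count */
		printf("encoded %u of 3 flavors in %zu bytes\n", supported, buflen);
		return 0;
	}
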
@@ -4425,34 +4675,25 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
}
-/*
- * The SETATTR encode routine is special -- it always encodes a bitmap,
- * regardless of the error status.
- */
static __be32
nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_setattr *setattr = &u->setattr;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- return nfserr_resource;
- if (nfserr) {
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- }
- else {
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(setattr->sa_bmval[0]);
- *p++ = cpu_to_be32(setattr->sa_bmval[1]);
- *p++ = cpu_to_be32(setattr->sa_bmval[2]);
+ switch (nfserr) {
+ case nfs_ok:
+ /* attrsset */
+ status = nfsd4_encode_bitmap4(resp->xdr, setattr->sa_bmval[0],
+ setattr->sa_bmval[1],
+ setattr->sa_bmval[2]);
+ break;
+ default:
+ /* attrsset */
+ status = nfsd4_encode_bitmap4(resp->xdr, 0, 0, 0);
}
- return nfserr;
+ return status != nfs_ok ? status : nfserr;
}
static __be32
@@ -4488,86 +4729,148 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_write *write = &u->write;
+ struct xdr_stream *xdr = resp->xdr;
- if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0)
+ /* count */
+ nfserr = nfsd4_encode_count4(xdr, write->wr_bytes_written);
+ if (nfserr)
+ return nfserr;
+ /* committed */
+ if (xdr_stream_encode_u32(xdr, write->wr_how_written) != XDR_UNIT)
return nfserr_resource;
- return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier);
+ /* writeverf */
+ return nfsd4_encode_verifier4(xdr, &write->wr_verifier);
}
static __be32
-nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_state_protect_ops4(struct xdr_stream *xdr,
+ struct nfsd4_exchange_id *exid)
{
- struct nfsd4_exchange_id *exid = &u->exchange_id;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- char *major_id;
- char *server_scope;
- int major_id_sz;
- int server_scope_sz;
- uint64_t minor_id = 0;
- struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+ __be32 status;
- major_id = nn->nfsd_name;
- major_id_sz = strlen(nn->nfsd_name);
- server_scope = nn->nfsd_name;
- server_scope_sz = strlen(nn->nfsd_name);
+ /* spo_must_enforce */
+ status = nfsd4_encode_bitmap4(xdr, exid->spo_must_enforce[0],
+ exid->spo_must_enforce[1],
+ exid->spo_must_enforce[2]);
+ if (status != nfs_ok)
+ return status;
+ /* spo_must_allow */
+ return nfsd4_encode_bitmap4(xdr, exid->spo_must_allow[0],
+ exid->spo_must_allow[1],
+ exid->spo_must_allow[2]);
+}
- if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok)
- return nfserr_resource;
- if (xdr_stream_encode_u32(xdr, exid->seqid) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u32(xdr, exid->flags) < 0)
- return nfserr_resource;
+static __be32
+nfsd4_encode_state_protect4_r(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid)
+{
+ __be32 status;
- if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0)
+ if (xdr_stream_encode_u32(xdr, exid->spa_how) != XDR_UNIT)
return nfserr_resource;
switch (exid->spa_how) {
case SP4_NONE:
+ status = nfs_ok;
break;
case SP4_MACH_CRED:
- /* spo_must_enforce bitmap: */
- nfserr = nfsd4_encode_bitmap(xdr,
- exid->spo_must_enforce[0],
- exid->spo_must_enforce[1],
- exid->spo_must_enforce[2]);
- if (nfserr)
- return nfserr;
- /* spo_must_allow bitmap: */
- nfserr = nfsd4_encode_bitmap(xdr,
- exid->spo_must_allow[0],
- exid->spo_must_allow[1],
- exid->spo_must_allow[2]);
- if (nfserr)
- return nfserr;
+ /* spr_mach_ops */
+ status = nfsd4_encode_state_protect_ops4(xdr, exid);
break;
default:
- WARN_ON_ONCE(1);
+ status = nfserr_serverfault;
}
+ return status;
+}
- p = xdr_reserve_space(xdr,
- 8 /* so_minor_id */ +
- 4 /* so_major_id.len */ +
- (XDR_QUADLEN(major_id_sz) * 4) +
- 4 /* eir_server_scope.len */ +
- (XDR_QUADLEN(server_scope_sz) * 4) +
- 4 /* eir_server_impl_id.count (0) */);
- if (!p)
+static __be32
+nfsd4_encode_server_owner4(struct xdr_stream *xdr, struct svc_rqst *rqstp)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 status;
+
+ /* so_minor_id */
+ status = nfsd4_encode_uint64_t(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ /* so_major_id */
+ return nfsd4_encode_opaque(xdr, nn->nfsd_name, strlen(nn->nfsd_name));
+}
+
+static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+ struct nfsd4_exchange_id *exid = &u->exchange_id;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* eir_clientid */
+ nfserr = nfsd4_encode_clientid4(xdr, &exid->clientid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_sequenceid */
+ nfserr = nfsd4_encode_sequenceid4(xdr, exid->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, exid->flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_state_protect */
+ nfserr = nfsd4_encode_state_protect4_r(xdr, exid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_owner */
+ nfserr = nfsd4_encode_server_owner4(xdr, resp->rqstp);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_scope */
+ nfserr = nfsd4_encode_opaque(xdr, nn->nfsd_name,
+ strlen(nn->nfsd_name));
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_impl_id<1> */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
return nfserr_resource;
- /* The server_owner struct */
- p = xdr_encode_hyper(p, minor_id); /* Minor id */
- /* major id */
- p = xdr_encode_opaque(p, major_id, major_id_sz);
+ return nfs_ok;
+}
- /* Server scope */
- p = xdr_encode_opaque(p, server_scope, server_scope_sz);
+static __be32
+nfsd4_encode_channel_attrs4(struct xdr_stream *xdr,
+ const struct nfsd4_channel_attrs *attrs)
+{
+ __be32 status;
- /* Implementation id */
- *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */
- return 0;
+ /* ca_headerpadsize */
+ status = nfsd4_encode_count4(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxrequestsize */
+ status = nfsd4_encode_count4(xdr, attrs->maxreq_sz);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxresponsesize */
+ status = nfsd4_encode_count4(xdr, attrs->maxresp_sz);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxresponsesize_cached */
+ status = nfsd4_encode_count4(xdr, attrs->maxresp_cached);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxoperations */
+ status = nfsd4_encode_count4(xdr, attrs->maxops);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxrequests */
+ status = nfsd4_encode_count4(xdr, attrs->maxreqs);
+ if (status != nfs_ok)
+ return status;
+ /* ca_rdma_ird<1> */
+ if (xdr_stream_encode_u32(xdr, attrs->nr_rdma_attrs) != XDR_UNIT)
+ return nfserr_resource;
+ if (attrs->nr_rdma_attrs)
+ return nfsd4_encode_uint32_t(xdr, attrs->rdma_attrs);
+ return nfs_ok;
}
static __be32
@@ -4576,52 +4879,25 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_create_session *sess = &u->create_session;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 24);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, sess->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(sess->seqid);
- *p++ = cpu_to_be32(sess->flags);
- p = xdr_reserve_space(xdr, 28);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0); /* headerpadsz */
- *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz);
- *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz);
- *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached);
- *p++ = cpu_to_be32(sess->fore_channel.maxops);
- *p++ = cpu_to_be32(sess->fore_channel.maxreqs);
- *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs);
-
- if (sess->fore_channel.nr_rdma_attrs) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs);
- }
-
- p = xdr_reserve_space(xdr, 28);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0); /* headerpadsz */
- *p++ = cpu_to_be32(sess->back_channel.maxreq_sz);
- *p++ = cpu_to_be32(sess->back_channel.maxresp_sz);
- *p++ = cpu_to_be32(sess->back_channel.maxresp_cached);
- *p++ = cpu_to_be32(sess->back_channel.maxops);
- *p++ = cpu_to_be32(sess->back_channel.maxreqs);
- *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs);
-
- if (sess->back_channel.nr_rdma_attrs) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(sess->back_channel.rdma_attrs);
- }
- return 0;
+ /* csr_sessionid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &sess->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_sequence */
+ nfserr = nfsd4_encode_sequenceid4(xdr, sess->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, sess->flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_fore_chan_attrs */
+ nfserr = nfsd4_encode_channel_attrs4(xdr, &sess->fore_channel);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_back_chan_attrs */
+ return nfsd4_encode_channel_attrs4(xdr, &sess->back_channel);
}
static __be32
@@ -4630,22 +4906,35 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_sequence *seq = &u->sequence;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, seq->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(seq->seqid);
- *p++ = cpu_to_be32(seq->slotid);
+ /* sr_sessionid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &seq->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_sequenceid */
+ nfserr = nfsd4_encode_sequenceid4(xdr, seq->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->slotid);
+ if (nfserr != nfs_ok)
+ return nfserr;
/* Note slotid's are numbered from zero: */
- *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */
- *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */
- *p++ = cpu_to_be32(seq->status_flags);
+ /* sr_highest_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_target_highest_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_status_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, seq->status_flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */
- return 0;
+ return nfs_ok;
}
static __be32
@@ -4653,125 +4942,132 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
- struct xdr_stream *xdr = resp->xdr;
struct nfsd4_test_stateid_id *stateid, *next;
- __be32 *p;
+ struct xdr_stream *xdr = resp->xdr;
- p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids));
- if (!p)
+ /* tsr_status_codes<> */
+ if (xdr_stream_encode_u32(xdr, test_stateid->ts_num_ids) != XDR_UNIT)
return nfserr_resource;
- *p++ = htonl(test_stateid->ts_num_ids);
-
- list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
- *p++ = stateid->ts_id_status;
+ list_for_each_entry_safe(stateid, next,
+ &test_stateid->ts_stateid_list, ts_id_list) {
+ if (xdr_stream_encode_be32(xdr, stateid->ts_id_status) != XDR_UNIT)
+ return nfserr_resource;
}
-
- return 0;
+ return nfs_ok;
}
#ifdef CONFIG_NFSD_PNFS
static __be32
-nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_device_addr4(struct xdr_stream *xdr,
+ const struct nfsd4_getdeviceinfo *gdev)
{
- struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
- struct xdr_stream *xdr = resp->xdr;
+ u32 needed_len, starting_len = xdr->buf->len;
const struct nfsd4_layout_ops *ops;
- u32 starting_len = xdr->buf->len, needed_len;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ /* da_layout_type */
+ if (xdr_stream_encode_u32(xdr, gdev->gd_layout_type) != XDR_UNIT)
return nfserr_resource;
-
- *p++ = cpu_to_be32(gdev->gd_layout_type);
-
+ /* da_addr_body */
ops = nfsd4_layout_ops[gdev->gd_layout_type];
- nfserr = ops->encode_getdeviceinfo(xdr, gdev);
- if (nfserr) {
+ status = ops->encode_getdeviceinfo(xdr, gdev);
+ if (status != nfs_ok) {
/*
- * We don't bother to burden the layout drivers with
- * enforcing gd_maxcount, just tell the client to
- * come back with a bigger buffer if it's not enough.
+ * Don't burden the layout drivers with enforcing
+ * gd_maxcount. Just tell the client to come back
+ * with a bigger buffer if it's not enough.
*/
- if (xdr->buf->len + 4 > gdev->gd_maxcount)
+ if (xdr->buf->len + XDR_UNIT > gdev->gd_maxcount)
goto toosmall;
- return nfserr;
+ return status;
}
- if (gdev->gd_notify_types) {
- p = xdr_reserve_space(xdr, 4 + 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(1); /* bitmap length */
- *p++ = cpu_to_be32(gdev->gd_notify_types);
- } else {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = 0;
- }
+ return nfs_ok;
- return 0;
toosmall:
- dprintk("%s: maxcount too small\n", __func__);
- needed_len = xdr->buf->len + 4 /* notifications */;
+ needed_len = xdr->buf->len + XDR_UNIT; /* notifications */
xdr_truncate_encode(xdr, starting_len);
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(needed_len);
+
+ status = nfsd4_encode_count4(xdr, needed_len);
+ if (status != nfs_ok)
+ return status;
return nfserr_toosmall;
}
static __be32
-nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
- struct nfsd4_layoutget *lgp = &u->layoutget;
+ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
struct xdr_stream *xdr = resp->xdr;
- const struct nfsd4_layout_ops *ops;
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(1); /* we always set return-on-close */
- *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
- p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
- sizeof(stateid_opaque_t));
+ /* gdir_device_addr */
+ nfserr = nfsd4_encode_device_addr4(xdr, gdev);
+ if (nfserr)
+ return nfserr;
+ /* gdir_notification */
+ return nfsd4_encode_bitmap4(xdr, gdev->gd_notify_types, 0, 0);
+}
- *p++ = cpu_to_be32(1); /* we always return a single layout */
- p = xdr_encode_hyper(p, lgp->lg_seg.offset);
- p = xdr_encode_hyper(p, lgp->lg_seg.length);
- *p++ = cpu_to_be32(lgp->lg_seg.iomode);
- *p++ = cpu_to_be32(lgp->lg_layout_type);
+static __be32
+nfsd4_encode_layout4(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp)
+{
+ const struct nfsd4_layout_ops *ops = nfsd4_layout_ops[lgp->lg_layout_type];
+ __be32 status;
- ops = nfsd4_layout_ops[lgp->lg_layout_type];
+ /* lo_offset */
+ status = nfsd4_encode_offset4(xdr, lgp->lg_seg.offset);
+ if (status != nfs_ok)
+ return status;
+ /* lo_length */
+ status = nfsd4_encode_length4(xdr, lgp->lg_seg.length);
+ if (status != nfs_ok)
+ return status;
+ /* lo_iomode */
+ if (xdr_stream_encode_u32(xdr, lgp->lg_seg.iomode) != XDR_UNIT)
+ return nfserr_resource;
+ /* lo_content */
+ if (xdr_stream_encode_u32(xdr, lgp->lg_layout_type) != XDR_UNIT)
+ return nfserr_resource;
return ops->encode_layoutget(xdr, lgp);
}
static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_layoutget *lgp = &u->layoutget;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* logr_return_on_close */
+ nfserr = nfsd4_encode_bool(xdr, true);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* logr_stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &lgp->lg_sid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* logr_layout<> */
+ if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+ return nfserr_resource;
+ return nfsd4_encode_layout4(xdr, lgp);
+}
+
+static __be32
nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(lcp->lc_size_chg);
- if (lcp->lc_size_chg) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_hyper(p, lcp->lc_newsize);
- }
-
- return 0;
+ /* ns_sizechanged */
+ nfserr = nfsd4_encode_bool(xdr, lcp->lc_size_chg);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ if (lcp->lc_size_chg)
+ /* ns_size */
+ return nfsd4_encode_length4(xdr, lcp->lc_newsize);
+ return nfs_ok;
}
static __be32
@@ -4780,103 +5076,108 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(lrp->lrs_present);
+ /* lrs_present */
+ nfserr = nfsd4_encode_bool(xdr, lrp->lrs_present);
+ if (nfserr != nfs_ok)
+ return nfserr;
if (lrp->lrs_present)
- return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
- return 0;
+ /* lrs_stateid */
+ return nfsd4_encode_stateid4(xdr, &lrp->lr_sid);
+ return nfs_ok;
}
#endif /* CONFIG_NFSD_PNFS */
static __be32
-nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
- struct nfsd42_write_res *write, bool sync)
+nfsd4_encode_write_response4(struct xdr_stream *xdr,
+ const struct nfsd4_copy *copy)
{
- __be32 *p;
- p = xdr_reserve_space(resp->xdr, 4);
- if (!p)
- return nfserr_resource;
+ const struct nfsd42_write_res *write = &copy->cp_res;
+ u32 count = nfsd4_copy_is_sync(copy) ? 0 : 1;
+ __be32 status;
- if (sync)
- *p++ = cpu_to_be32(0);
- else {
- __be32 nfserr;
- *p++ = cpu_to_be32(1);
- nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid);
- if (nfserr)
- return nfserr;
+ /* wr_callback_id<1> */
+ if (xdr_stream_encode_u32(xdr, count) != XDR_UNIT)
+ return nfserr_resource;
+ if (count) {
+ status = nfsd4_encode_stateid4(xdr, &write->cb_stateid);
+ if (status != nfs_ok)
+ return status;
}
- p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
- if (!p)
+
+ /* wr_count */
+ status = nfsd4_encode_length4(xdr, write->wr_bytes_written);
+ if (status != nfs_ok)
+ return status;
+ /* wr_committed */
+ if (xdr_stream_encode_u32(xdr, write->wr_stable_how) != XDR_UNIT)
return nfserr_resource;
+ /* wr_writeverf */
+ return nfsd4_encode_verifier4(xdr, &write->wr_verifier);
+}
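
The wr_callback_id<1> field encoded above is the usual XDR way of expressing an optional value: a counted array whose length is 0 for a synchronous copy and 1 (followed by the callback stateid) for an asynchronous one. A short illustrative sketch of the pattern with placeholder encoders:

	#include <stdint.h>
	#include <stdio.h>

	static int encode_u32(uint32_t v) { printf("u32 %u\n", v); return 0; }
	static int encode_stateid(void)   { puts("stateid body"); return 0; }

	/* XDR "optional" as a counted array of zero or one elements, e.g. wr_callback_id<1>. */
	static int encode_optional_stateid(int present)
	{
		int err = encode_u32(present ? 1 : 0);

		if (err)
			return err;
		return present ? encode_stateid() : 0;
	}

	int main(void)
	{
		return encode_optional_stateid(1);
	}
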
- p = xdr_encode_hyper(p, write->wr_bytes_written);
- *p++ = cpu_to_be32(write->wr_stable_how);
- p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
- NFS4_VERIFIER_SIZE);
- return nfs_ok;
+static __be32 nfsd4_encode_copy_requirements4(struct xdr_stream *xdr,
+ const struct nfsd4_copy *copy)
+{
+ __be32 status;
+
+ /* cr_consecutive */
+ status = nfsd4_encode_bool(xdr, true);
+ if (status != nfs_ok)
+ return status;
+ /* cr_synchronous */
+ return nfsd4_encode_bool(xdr, nfsd4_copy_is_sync(copy));
}
static __be32
-nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
- struct xdr_stream *xdr = resp->xdr;
- struct nfs42_netaddr *addr;
- __be32 *p;
+ struct nfsd4_copy *copy = &u->copy;
- p = xdr_reserve_space(xdr, 4);
- *p++ = cpu_to_be32(ns->nl4_type);
+ nfserr = nfsd4_encode_write_response4(resp->xdr, copy);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ return nfsd4_encode_copy_requirements4(resp->xdr, copy);
+}
+static __be32
+nfsd4_encode_netloc4(struct xdr_stream *xdr, const struct nl4_server *ns)
+{
+ __be32 status;
+
+ if (xdr_stream_encode_u32(xdr, ns->nl4_type) != XDR_UNIT)
+ return nfserr_resource;
switch (ns->nl4_type) {
case NL4_NETADDR:
- addr = &ns->u.nl4_addr;
-
- /* netid_len, netid, uaddr_len, uaddr (port included
- * in RPCBIND_MAXUADDRLEN)
- */
- p = xdr_reserve_space(xdr,
- 4 /* netid len */ +
- (XDR_QUADLEN(addr->netid_len) * 4) +
- 4 /* uaddr len */ +
- (XDR_QUADLEN(addr->addr_len) * 4));
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(addr->netid_len);
- p = xdr_encode_opaque_fixed(p, addr->netid,
- addr->netid_len);
- *p++ = cpu_to_be32(addr->addr_len);
- p = xdr_encode_opaque_fixed(p, addr->addr,
- addr->addr_len);
+ /* nl_addr */
+ status = nfsd4_encode_netaddr4(xdr, &ns->u.nl4_addr);
break;
default:
- WARN_ON_ONCE(ns->nl4_type != NL4_NETADDR);
- return nfserr_inval;
+ status = nfserr_serverfault;
}
-
- return 0;
+ return status;
}
static __be32
-nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
- struct nfsd4_copy *copy = &u->copy;
- __be32 *p;
+ struct nfsd4_copy_notify *cn = &u->copy_notify;
+ struct xdr_stream *xdr = resp->xdr;
- nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
- nfsd4_copy_is_sync(copy));
+ /* cnr_lease_time */
+ nfserr = nfsd4_encode_nfstime4(xdr, &cn->cpn_lease_time);
if (nfserr)
return nfserr;
-
- p = xdr_reserve_space(resp->xdr, 4 + 4);
- *p++ = xdr_one; /* cr_consecutive */
- *p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero;
- return 0;
+ /* cnr_stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &cn->cpn_cnr_stateid);
+ if (nfserr)
+ return nfserr;
+ /* cnr_source_server<> */
+ if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+ return nfserr_resource;
+ return nfsd4_encode_netloc4(xdr, cn->cpn_src);
}
static __be32
@@ -4885,14 +5186,15 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_offload_status *os = &u->offload_status;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 8 + 4);
- if (!p)
+ /* osr_count */
+ nfserr = nfsd4_encode_length4(xdr, os->count);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* osr_complete<1> */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
return nfserr_resource;
- p = xdr_encode_hyper(p, os->count);
- *p++ = cpu_to_be32(0);
- return nfserr;
+ return nfs_ok;
}
static __be32
@@ -4970,53 +5272,18 @@ out:
}
static __be32
-nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
-{
- struct nfsd4_copy_notify *cn = &u->copy_notify;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
-
- if (nfserr)
- return nfserr;
-
- /* 8 sec, 4 nsec */
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- return nfserr_resource;
-
- /* cnr_lease_time */
- p = xdr_encode_hyper(p, cn->cpn_sec);
- *p++ = cpu_to_be32(cn->cpn_nsec);
-
- /* cnr_stateid */
- nfserr = nfsd4_encode_stateid(xdr, &cn->cpn_cnr_stateid);
- if (nfserr)
- return nfserr;
-
- /* cnr_src.nl_nsvr */
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(1);
-
- nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src);
- return nfserr;
-}
-
-static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_seek *seek = &u->seek;
- __be32 *p;
-
- p = xdr_reserve_space(resp->xdr, 4 + 8);
- *p++ = cpu_to_be32(seek->seek_eof);
- p = xdr_encode_hyper(p, seek->seek_pos);
+ struct xdr_stream *xdr = resp->xdr;
- return 0;
+ /* sr_eof */
+ nfserr = nfsd4_encode_bool(xdr, seek->seek_eof);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_offset */
+ return nfsd4_encode_offset4(xdr, seek->seek_pos);
}
static __be32
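Note: the rewritten encoders above replace open-coded xdr_reserve_space() arithmetic with small per-XDR-type helpers (nfsd4_encode_bool(), nfsd4_encode_offset4(), and so on, defined later in this patch in xdr4.h) that each return nfs_ok or nfserr_resource, so a result encoder becomes a short chain of calls with early returns. A minimal standalone sketch of that shape, using hypothetical encode_bool()/encode_u64() helpers over a plain byte buffer rather than the kernel's struct xdr_stream:

/* Standalone sketch: XDR-style per-type encoders with early-return chaining.
 * Hypothetical helpers; not the kernel's xdr_stream API.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XDR_UNIT 4

enum status { NFS_OK = 0, NFSERR_RESOURCE = 1 };

struct cursor {
	unsigned char *p;	/* next free byte */
	unsigned char *end;	/* one past the end of the buffer */
};

/* Reserve "len" bytes, or fail if the buffer is exhausted. */
static unsigned char *reserve(struct cursor *c, size_t len)
{
	unsigned char *p = c->p;

	if ((size_t)(c->end - c->p) < len)
		return NULL;
	c->p += len;
	return p;
}

/* XDR bool: one 4-byte big-endian word, 0 or 1. */
static enum status encode_bool(struct cursor *c, int val)
{
	unsigned char *p = reserve(c, XDR_UNIT);

	if (!p)
		return NFSERR_RESOURCE;
	memset(p, 0, XDR_UNIT);
	p[3] = val ? 1 : 0;
	return NFS_OK;
}

/* XDR uint64: two 4-byte words, big-endian. */
static enum status encode_u64(struct cursor *c, uint64_t val)
{
	unsigned char *p = reserve(c, 2 * XDR_UNIT);
	int i;

	if (!p)
		return NFSERR_RESOURCE;
	for (i = 7; i >= 0; i--) {
		p[i] = val & 0xff;
		val >>= 8;
	}
	return NFS_OK;
}

/* Mirrors the SEEK result shape: sr_eof (bool) then sr_offset (uint64). */
static enum status encode_seek_result(struct cursor *c, int eof, uint64_t pos)
{
	enum status status = encode_bool(c, eof);

	if (status != NFS_OK)
		return status;
	return encode_u64(c, pos);
}

int main(void)
{
	unsigned char buf[12];
	struct cursor c = { buf, buf + sizeof(buf) };
	size_t i;

	if (encode_seek_result(&c, 1, 0x1234) != NFS_OK)
		return 1;
	for (i = 0; i < sizeof(buf); i++)
		printf("%02x%s", buf[i], (i % XDR_UNIT == 3) ? "\n" : " ");
	return 0;
}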
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 80621a709510..d3273a396659 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -201,26 +201,29 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
{
unsigned int hashsize;
unsigned int i;
- int status = 0;
nn->max_drc_entries = nfsd_cache_size_limit();
atomic_set(&nn->num_drc_entries, 0);
hashsize = nfsd_hashsize(nn->max_drc_entries);
nn->maskbits = ilog2(hashsize);
- nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
- nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
- nn->nfsd_reply_cache_shrinker.seeks = 1;
- status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
- "nfsd-reply:%s", nn->nfsd_name);
- if (status)
- return status;
-
nn->drc_hashtbl = kvzalloc(array_size(hashsize,
sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
if (!nn->drc_hashtbl)
+ return -ENOMEM;
+
+ nn->nfsd_reply_cache_shrinker = shrinker_alloc(0, "nfsd-reply:%s",
+ nn->nfsd_name);
+ if (!nn->nfsd_reply_cache_shrinker)
goto out_shrinker;
+ nn->nfsd_reply_cache_shrinker->scan_objects = nfsd_reply_cache_scan;
+ nn->nfsd_reply_cache_shrinker->count_objects = nfsd_reply_cache_count;
+ nn->nfsd_reply_cache_shrinker->seeks = 1;
+ nn->nfsd_reply_cache_shrinker->private_data = nn;
+
+ shrinker_register(nn->nfsd_reply_cache_shrinker);
+
for (i = 0; i < hashsize; i++) {
INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);
spin_lock_init(&nn->drc_hashtbl[i].cache_lock);
@@ -229,7 +232,7 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
return 0;
out_shrinker:
- unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+ kvfree(nn->drc_hashtbl);
printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
return -ENOMEM;
}
@@ -239,7 +242,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
struct nfsd_cacherep *rp;
unsigned int i;
- unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+ shrinker_free(nn->nfsd_reply_cache_shrinker);
for (i = 0; i < nn->drc_hashsize; i++) {
struct list_head *head = &nn->drc_hashtbl[i].lru_head;
@@ -323,8 +326,7 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
return atomic_read(&nn->num_drc_entries);
}
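Note: the shrinker callbacks above stop using container_of() to recover struct nfsd_net from an embedded shrinker and instead read shrink->private_data, which shrinker_alloc() lets the owner set. For reference, a small standalone illustration of the container_of idiom the old code relied on (the struct names here are invented for the example):

/* Standalone illustration of the container_of idiom; not kernel code. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct shrinker {
	int seeks;
};

struct nfsd_net_like {
	long num_entries;
	struct shrinker reply_cache_shrinker;	/* embedded member */
};

/* Old-style callback: only the member pointer is passed in. */
static long count_objects(struct shrinker *shrink)
{
	struct nfsd_net_like *nn =
		container_of(shrink, struct nfsd_net_like,
			     reply_cache_shrinker);

	return nn->num_entries;
}

int main(void)
{
	struct nfsd_net_like nn = { .num_entries = 42 };

	/* Passing &nn.reply_cache_shrinker still lets us reach nn. */
	printf("%ld\n", count_objects(&nn.reply_cache_shrinker));
	return 0;
}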
@@ -343,8 +345,7 @@ nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
unsigned long freed = 0;
LIST_HEAD(dispose);
unsigned int i;
@@ -368,33 +369,52 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
return freed;
}
-/*
- * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+/**
+ * nfsd_cache_csum - Checksum incoming NFS Call arguments
+ * @buf: buffer containing a whole RPC Call message
+ * @start: starting byte of the NFS Call header
+ * @remaining: size of the NFS Call header, in bytes
+ *
+ * Compute a weak checksum of the leading bytes of an NFS procedure
+ * call header to help verify that a retransmitted Call matches an
+ * entry in the duplicate reply cache.
+ *
+ * To avoid assumptions about how the RPC message is laid out in
+ * @buf and what else it might contain (e.g., a GSS MIC suffix), the
+ * caller passes us the exact location and length of the NFS Call
+ * header.
+ *
+ * Returns a 32-bit checksum value, as defined in RFC 793.
*/
-static __wsum
-nfsd_cache_csum(struct svc_rqst *rqstp)
+static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start,
+ unsigned int remaining)
{
+ unsigned int base, len;
+ struct xdr_buf subbuf;
+ __wsum csum = 0;
+ void *p;
int idx;
- unsigned int base;
- __wsum csum;
- struct xdr_buf *buf = &rqstp->rq_arg;
- const unsigned char *p = buf->head[0].iov_base;
- size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
- RC_CSUMLEN);
- size_t len = min(buf->head[0].iov_len, csum_len);
+
+ if (remaining > RC_CSUMLEN)
+ remaining = RC_CSUMLEN;
+ if (xdr_buf_subsegment(buf, &subbuf, start, remaining))
+ return csum;
/* rq_arg.head first */
- csum = csum_partial(p, len, 0);
- csum_len -= len;
+ if (subbuf.head[0].iov_len) {
+ len = min_t(unsigned int, subbuf.head[0].iov_len, remaining);
+ csum = csum_partial(subbuf.head[0].iov_base, len, csum);
+ remaining -= len;
+ }
/* Continue into page array */
- idx = buf->page_base / PAGE_SIZE;
- base = buf->page_base & ~PAGE_MASK;
- while (csum_len) {
- p = page_address(buf->pages[idx]) + base;
- len = min_t(size_t, PAGE_SIZE - base, csum_len);
+ idx = subbuf.page_base / PAGE_SIZE;
+ base = subbuf.page_base & ~PAGE_MASK;
+ while (remaining) {
+ p = page_address(subbuf.pages[idx]) + base;
+ len = min_t(unsigned int, PAGE_SIZE - base, remaining);
csum = csum_partial(p, len, csum);
- csum_len -= len;
+ remaining -= len;
base = 0;
++idx;
}
@@ -465,6 +485,8 @@ out:
/**
* nfsd_cache_lookup - Find an entry in the duplicate reply cache
* @rqstp: Incoming Call to find
+ * @start: starting byte in @rqstp->rq_arg of the NFS Call header
+ * @len: size of the NFS Call header, in bytes
* @cacherep: OUT: DRC entry for this request
*
* Try to find an entry matching the current call in the cache. When none
@@ -478,7 +500,8 @@ out:
* %RC_REPLY: Reply from cache
* %RC_DROPIT: Do not process the request further
*/
-int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+ unsigned int len, struct nfsd_cacherep **cacherep)
{
struct nfsd_net *nn;
struct nfsd_cacherep *rp, *found;
@@ -494,7 +517,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
goto out;
}
- csum = nfsd_cache_csum(rqstp);
+ csum = nfsd_cache_csum(&rqstp->rq_arg, start, len);
/*
* Since the common case is a cache miss followed by an insert,
@@ -640,24 +663,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
return;
}
-/*
- * Copy cached reply to current reply buffer. Should always fit.
- * FIXME as reply is in a page, we should just attach the page, and
- * keep a refcount....
- */
static int
nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
{
- struct kvec *vec = &rqstp->rq_res.head[0];
-
- if (vec->iov_len + data->iov_len > PAGE_SIZE) {
- printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n",
- data->iov_len);
- return 0;
- }
- memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
- vec->iov_len += data->iov_len;
- return 1;
+ __be32 *p;
+
+ p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len);
+ if (unlikely(!p))
+ return false;
+ memcpy(p, data->iov_base, data->iov_len);
+ xdr_commit_encode(&rqstp->rq_res_stream);
+ return true;
}
/*
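Note: nfsd_cache_csum() above now checksums only the NFS Call header: it takes a subsegment of the xdr_buf, folds in the head kvec first, then walks the page array, stopping after at most RC_CSUMLEN bytes. A standalone sketch of that capped walk, assuming a simplified two-part buffer and a plain additive 32-bit accumulator in place of the kernel's csum_partial():

/* Standalone sketch of a capped checksum walk over head + page-like chunks.
 * Simplified stand-ins for struct xdr_buf and csum_partial(); illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define RC_CSUMLEN	256
#define CHUNK_SIZE	64	/* stand-in for PAGE_SIZE */

struct flat_buf {
	const unsigned char *head;
	size_t head_len;
	const unsigned char *chunks[4];	/* stand-in for the page array */
};

/* Byte-wise running 32-bit sum; csum_partial() is folded ones' complement,
 * but any cheap accumulator illustrates the walk. */
static uint32_t csum_bytes(const unsigned char *p, size_t len, uint32_t csum)
{
	while (len--)
		csum += *p++;
	return csum;
}

static uint32_t cache_csum(const struct flat_buf *buf, size_t remaining)
{
	uint32_t csum = 0;
	size_t len, idx = 0;

	if (remaining > RC_CSUMLEN)
		remaining = RC_CSUMLEN;

	/* head first */
	len = buf->head_len < remaining ? buf->head_len : remaining;
	csum = csum_bytes(buf->head, len, csum);
	remaining -= len;

	/* then continue into the chunk array */
	while (remaining) {
		len = remaining < CHUNK_SIZE ? remaining : CHUNK_SIZE;
		csum = csum_bytes(buf->chunks[idx++], len, csum);
		remaining -= len;
	}
	return csum;
}

int main(void)
{
	static const unsigned char head[32] = "nfs call header";
	static const unsigned char page0[CHUNK_SIZE] = "payload...";
	struct flat_buf buf = {
		.head = head, .head_len = sizeof(head),
		.chunks = { page0 },
	};

	printf("csum=%#x\n",
	       (unsigned)cache_csum(&buf, sizeof(head) + sizeof(page0)));
	return 0;
}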
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7ed02fb88a36..87fed75808ff 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -26,6 +26,7 @@
#include "pnfs.h"
#include "filecache.h"
#include "trace.h"
+#include "netlink.h"
/*
* We have a single directory with several nodes in it.
@@ -692,6 +693,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
char *mesg = buf;
int fd, err;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
err = get_int(&mesg, &fd);
if (err != 0 || fd < 0)
@@ -702,13 +704,15 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
if (err != 0)
return err;
- err = svc_addsock(nn->nfsd_serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
+ serv = nn->nfsd_serv;
+ err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
- if (err >= 0 &&
- !nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
- svc_get(nn->nfsd_serv);
+ if (err < 0 && !serv->sv_nrthreads && !nn->keep_active)
+ nfsd_last_thread(net);
+ else if (err >= 0 && !serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
+ svc_get(serv);
- nfsd_put(net);
+ svc_put(serv);
return err;
}
@@ -722,6 +726,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
struct svc_xprt *xprt;
int port, err;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
if (sscanf(buf, "%15s %5u", transport, &port) != 2)
return -EINVAL;
@@ -734,29 +739,33 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
if (err != 0)
return err;
- err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ serv = nn->nfsd_serv;
+ err = svc_xprt_create(serv, transport, net,
PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0)
goto out_err;
- err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ err = svc_xprt_create(serv, transport, net,
PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
- if (!nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
- svc_get(nn->nfsd_serv);
+ if (!serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
+ svc_get(serv);
- nfsd_put(net);
+ svc_put(serv);
return 0;
out_close:
- xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
+ xprt = svc_find_xprt(serv, transport, net, PF_INET, port);
if (xprt != NULL) {
svc_xprt_close(xprt);
svc_xprt_put(xprt);
}
out_err:
- nfsd_put(net);
+ if (!serv->sv_nrthreads && !nn->keep_active)
+ nfsd_last_thread(net);
+
+ svc_put(serv);
return err;
}
@@ -1132,7 +1141,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
/* Following advice from simple_fill_super documentation: */
inode->i_ino = iunique(sb, NFSD_MaxReserved);
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_fop = &simple_dir_operations;
@@ -1496,6 +1505,200 @@ static int create_proc_exports_entry(void)
unsigned int nfsd_net_id;
/**
+ * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ * %0: The rpc_status_get command may proceed
+ * %-ENODEV: There is no NFSD running in this namespace
+ */
+int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb)
+{
+ struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
+ int ret = -ENODEV;
+
+ mutex_lock(&nfsd_mutex);
+ if (nn->nfsd_serv)
+ ret = 0;
+ else
+ mutex_unlock(&nfsd_mutex);
+
+ return ret;
+}
+
+static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nfsd_genl_rqstp *rqstp)
+{
+ void *hdr;
+ u32 i;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) ||
+ nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) ||
+ nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME,
+ ktime_to_us(rqstp->rq_stime),
+ NFSD_A_RPC_STATUS_PAD))
+ return -ENOBUFS;
+
+ switch (rqstp->rq_saddr.sa_family) {
+ case AF_INET: {
+ const struct sockaddr_in *s_in, *d_in;
+
+ s_in = (const struct sockaddr_in *)&rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in *)&rqstp->rq_daddr;
+ if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4,
+ s_in->sin_addr.s_addr) ||
+ nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4,
+ d_in->sin_addr.s_addr) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT,
+ s_in->sin_port) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT,
+ d_in->sin_port))
+ return -ENOBUFS;
+ break;
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *s_in, *d_in;
+
+ s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr;
+ if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6,
+ &s_in->sin6_addr) ||
+ nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6,
+ &d_in->sin6_addr) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT,
+ s_in->sin6_port) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT,
+ d_in->sin6_port))
+ return -ENOBUFS;
+ break;
+ }
+ }
+
+ for (i = 0; i < rqstp->rq_opcnt; i++)
+ if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS,
+ rqstp->rq_opnum[i]))
+ return -ENOBUFS;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+}
+
+/**
+ * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit
+ * @skb: reply buffer
+ * @cb: netlink metadata and command arguments
+ *
+ * Returns the size of the reply or a negative errno.
+ */
+int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
+ int i, ret, rqstp_index = 0;
+
+ rcu_read_lock();
+
+ for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
+ struct svc_rqst *rqstp;
+
+ if (i < cb->args[0]) /* already consumed */
+ continue;
+
+ rqstp_index = 0;
+ list_for_each_entry_rcu(rqstp,
+ &nn->nfsd_serv->sv_pools[i].sp_all_threads,
+ rq_all) {
+ struct nfsd_genl_rqstp genl_rqstp;
+ unsigned int status_counter;
+
+ if (rqstp_index++ < cb->args[1]) /* already consumed */
+ continue;
+ /*
+			 * Acquire rq_status_counter before parsing the rqstp
+			 * fields. The counter is set to an odd value to tell
+			 * consumers that the rqstp fields are meaningful.
+ */
+ status_counter =
+ smp_load_acquire(&rqstp->rq_status_counter);
+ if (!(status_counter & 1))
+ continue;
+
+ genl_rqstp.rq_xid = rqstp->rq_xid;
+ genl_rqstp.rq_flags = rqstp->rq_flags;
+ genl_rqstp.rq_vers = rqstp->rq_vers;
+ genl_rqstp.rq_prog = rqstp->rq_prog;
+ genl_rqstp.rq_proc = rqstp->rq_proc;
+ genl_rqstp.rq_stime = rqstp->rq_stime;
+ genl_rqstp.rq_opcnt = 0;
+ memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp),
+ sizeof(struct sockaddr));
+ memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp),
+ sizeof(struct sockaddr));
+
+#ifdef CONFIG_NFSD_V4
+ if (rqstp->rq_vers == NFS4_VERSION &&
+ rqstp->rq_proc == NFSPROC4_COMPOUND) {
+ /* NFSv4 compound */
+ struct nfsd4_compoundargs *args;
+ int j;
+
+ args = rqstp->rq_argp;
+ genl_rqstp.rq_opcnt = args->opcnt;
+ for (j = 0; j < genl_rqstp.rq_opcnt; j++)
+ genl_rqstp.rq_opnum[j] =
+ args->ops[j].opnum;
+ }
+#endif /* CONFIG_NFSD_V4 */
+
+ /*
+			 * Re-read rq_status_counter; if it changed while the
+			 * rqstp fields were being copied, skip this entry
+			 * rather than report stale data.
+ */
+ if (smp_load_acquire(&rqstp->rq_status_counter) !=
+ status_counter)
+ continue;
+
+ ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
+ &genl_rqstp);
+ if (ret)
+ goto out;
+ }
+ }
+
+ cb->args[0] = i;
+ cb->args[1] = rqstp_index;
+ ret = skb->len;
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ * %0: Success
+ */
+int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
+{
+ mutex_unlock(&nfsd_mutex);
+
+ return 0;
+}
+
+/**
* nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
* @net: a freshly-created network namespace
*
@@ -1589,6 +1792,10 @@ static int __init init_nfsd(void)
retval = register_filesystem(&nfsd_fs_type);
if (retval)
goto out_free_all;
+ retval = genl_register_family(&nfsd_nl_family);
+ if (retval)
+ goto out_free_all;
+
return 0;
out_free_all:
nfsd4_destroy_laundry_wq();
@@ -1613,6 +1820,7 @@ out_free_slabs:
static void __exit exit_nfsd(void)
{
+ genl_unregister_family(&nfsd_nl_family);
unregister_filesystem(&nfsd_fs_type);
nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
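Note: nfsd_nl_rpc_status_get_dumpit() above follows the usual netlink dump pattern: a pass may stop early when the reply skb fills up, so the callback saves its position in cb->args[0] (pool) and cb->args[1] (thread) and skips already-consumed entries when it is called again. A standalone, slightly restructured sketch of that resumable two-level walk; emit(), the pool/thread sizes and the per-pass budget are invented for the example:

/* Standalone sketch of a resumable two-level dump, netlink-dumpit style. */
#include <stdbool.h>
#include <stdio.h>

#define NPOOLS   3
#define NTHREADS 4

struct dump_state {
	int args[2];	/* [0] = pool to resume at, [1] = thread to resume at */
};

/* Pretend the reply buffer only holds this many records per pass. */
static int budget;

static bool emit(int pool, int thread)
{
	if (budget == 0)
		return false;	/* "skb full": stop and resume next pass */
	budget--;
	printf("pool %d thread %d\n", pool, thread);
	return true;
}

/* One dump pass; returns true once the whole table has been emitted. */
static bool dump_pass(struct dump_state *cb)
{
	int i, j;

	for (i = cb->args[0]; i < NPOOLS; i++) {
		for (j = cb->args[1]; j < NTHREADS; j++) {
			if (!emit(i, j)) {
				/* Buffer full: resume at this exact entry. */
				cb->args[0] = i;
				cb->args[1] = j;
				return false;
			}
		}
		cb->args[1] = 0;	/* next pool starts at its first thread */
	}
	cb->args[0] = NPOOLS;
	return true;
}

int main(void)
{
	struct dump_state cb = { { 0, 0 } };
	int pass = 0;

	do {
		budget = 5;	/* refill the "reply buffer" for each pass */
		printf("-- pass %d --\n", ++pass);
	} while (!dump_pass(&cb));
	return 0;
}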
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 11c14faa6c67..9ed0e08d16c2 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -62,6 +62,23 @@ struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
};
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND 50
+
+struct nfsd_genl_rqstp {
+ struct sockaddr rq_daddr;
+ struct sockaddr rq_saddr;
+ unsigned long rq_flags;
+ ktime_t rq_stime;
+ __be32 rq_xid;
+ u32 rq_vers;
+ u32 rq_prog;
+ u32 rq_proc;
+
+ /* NFSv4 compound */
+ u32 rq_opcnt;
+ u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND];
+};
extern struct svc_program nfsd_program;
extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4;
@@ -96,13 +113,6 @@ int nfsd_pool_stats_open(struct inode *, struct file *);
int nfsd_pool_stats_release(struct inode *, struct file *);
void nfsd_shutdown_threads(struct net *net);
-static inline void nfsd_put(struct net *net)
-{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
- svc_put(nn->nfsd_serv);
-}
-
bool i_am_nfsd(void);
struct nfsdfs_client {
@@ -138,6 +148,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change);
int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change);
void nfsd_reset_versions(struct nfsd_net *nn);
int nfsd_create_serv(struct net *net);
+void nfsd_last_thread(struct net *net);
extern int nfsd_max_blksize;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 355bf0db3235..dbfa0ac13564 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -771,7 +771,7 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
* assume that the new change attr is always logged to stable storage in some
* fashion before the results can be seen.
*/
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
+u64 nfsd4_change_attribute(const struct kstat *stat, const struct inode *inode)
{
u64 chattr;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 40426f899e76..6ebdf7ea27bf 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -293,7 +293,8 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
fhp->fh_pre_saved = false;
}
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
+u64 nfsd4_change_attribute(const struct kstat *stat,
+ const struct inode *inode);
__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp);
__be32 fh_fill_post_attrs(struct svc_fh *fhp);
__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index c7af1095f6b5..7a2bc8e82a63 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -542,7 +542,7 @@ static struct notifier_block nfsd_inet6addr_notifier = {
/* Only used under nfsd_mutex, so this atomic may be overkill: */
static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
-static void nfsd_last_thread(struct net *net)
+void nfsd_last_thread(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_serv *serv = nn->nfsd_serv;
@@ -572,7 +572,6 @@ static void nfsd_last_thread(struct net *net)
return;
nfsd_shutdown_net(net);
- pr_info("nfsd: last server has exited, flushing export cache\n");
nfsd_export_flush(net);
}
@@ -713,14 +712,13 @@ int nfsd_nrpools(struct net *net)
int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
{
- int i = 0;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv = nn->nfsd_serv;
+ int i;
- if (nn->nfsd_serv != NULL) {
- for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
- nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
- }
-
+ if (serv)
+ for (i = 0; i < serv->sv_nrpools && i < n; i++)
+ nthreads[i] = atomic_read(&serv->sv_pools[i].sp_nrthreads);
return 0;
}
@@ -787,7 +785,6 @@ int
nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
{
int error;
- bool nfsd_up_before;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_serv *serv;
@@ -807,8 +804,6 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
error = nfsd_create_serv(net);
if (error)
goto out;
-
- nfsd_up_before = nn->nfsd_net_up;
serv = nn->nfsd_serv;
error = nfsd_startup_net(net, cred);
@@ -816,17 +811,15 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
goto out_put;
error = svc_set_num_threads(serv, NULL, nrservs);
if (error)
- goto out_shutdown;
+ goto out_put;
error = serv->sv_nrthreads;
- if (error == 0)
- nfsd_last_thread(net);
-out_shutdown:
- if (error < 0 && !nfsd_up_before)
- nfsd_shutdown_net(net);
out_put:
/* Threads now hold service active */
if (xchg(&nn->keep_active, 0))
svc_put(serv);
+
+ if (serv->sv_nrthreads == 0)
+ nfsd_last_thread(net);
svc_put(serv);
out:
mutex_unlock(&nfsd_mutex);
@@ -957,12 +950,11 @@ nfsd(void *vrqstp)
/*
* The main request loop
*/
- while (!kthread_should_stop()) {
+ while (!svc_thread_should_stop(rqstp)) {
/* Update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nn->max_connections;
svc_recv(rqstp);
- validate_process_creds();
}
atomic_dec(&nfsdstats.th_cnt);
@@ -988,6 +980,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
const struct svc_procedure *proc = rqstp->rq_procinfo;
__be32 *statp = rqstp->rq_accept_statp;
struct nfsd_cacherep *rp;
+ unsigned int start, len;
+ __be32 *nfs_reply;
/*
* Give the xdr decoder a chance to change this if it wants
@@ -995,11 +989,27 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
*/
rqstp->rq_cachetype = proc->pc_cachetype;
+ /*
+ * ->pc_decode advances the argument stream past the NFS
+ * Call header, so grab the header's starting location and
+ * size now for the call to nfsd_cache_lookup().
+ */
+ start = xdr_stream_pos(&rqstp->rq_arg_stream);
+ len = xdr_stream_remaining(&rqstp->rq_arg_stream);
if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
+ /*
+	 * Release rq_status_counter, setting it to an odd value, now that the
+	 * RPC request has been parsed. The counter tells consumers whether the
+	 * rqstp fields are stable and meaningful (odd) or not meaningful
+	 * (even).
+ */
+ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
+
rp = NULL;
- switch (nfsd_cache_lookup(rqstp, &rp)) {
+ switch (nfsd_cache_lookup(rqstp, start, len, &rp)) {
case RC_DOIT:
break;
case RC_REPLY:
@@ -1008,6 +1018,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
goto out_dropit;
}
+ nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0);
*statp = proc->pc_func(rqstp);
if (test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
@@ -1015,7 +1026,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
- nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
+ /*
+	 * Release rq_status_counter, setting it back to an even value now that
+	 * the RPC request has been fully processed.
+ */
+ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
+
+ nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply);
out_cached_reply:
return 1;
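Note: the rq_status_counter logic above behaves like a sequence counter: nfsd_dispatch() store-releases an odd value once the decoded rqstp fields are meaningful and makes it even again when processing finishes, while the netlink dump side load-acquires the counter, only trusts odd values, and re-checks the counter after copying the fields. A standalone C11 sketch of that single-writer handshake; the struct and field names are stand-ins, and the fields are atomics only so the example has no formal data races (the kernel uses plain fields with smp_store_release()/smp_load_acquire()):

/* Standalone sketch of the odd/even status-counter handshake (seqcount-like).
 * One writer, any number of readers; illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct request {
	_Atomic unsigned int status_counter;	/* odd = fields are valid */
	_Atomic uint32_t xid;
	_Atomic uint32_t proc;
};

/* Writer: publish the decoded fields, then flip the counter odd. */
static void writer_parsed(struct request *rq, uint32_t xid, uint32_t proc)
{
	rq->xid = xid;
	rq->proc = proc;
	atomic_fetch_or_explicit(&rq->status_counter, 1, memory_order_release);
}

/* Writer: request fully processed; counter becomes even again. */
static void writer_done(struct request *rq)
{
	atomic_fetch_add_explicit(&rq->status_counter, 1, memory_order_release);
}

/* Reader: take a snapshot only if the counter is odd and stays unchanged. */
static bool reader_snapshot(struct request *rq, uint32_t *xid, uint32_t *proc)
{
	unsigned int seq;

	seq = atomic_load_explicit(&rq->status_counter, memory_order_acquire);
	if (!(seq & 1))
		return false;		/* fields not meaningful right now */

	*xid = rq->xid;
	*proc = rq->proc;

	/* Counter moved while copying?  Then the snapshot may be stale. */
	return atomic_load_explicit(&rq->status_counter,
				    memory_order_acquire) == seq;
}

int main(void)
{
	struct request rq = { 0 };
	uint32_t xid, proc;

	printf("before parse: %s\n",
	       reader_snapshot(&rq, &xid, &proc) ? "report" : "skip");
	writer_parsed(&rq, 0xabcd, 1);
	if (reader_snapshot(&rq, &xid, &proc))
		printf("in flight: xid=%#x proc=%u\n",
		       (unsigned)xid, (unsigned)proc);
	writer_done(&rq);
	printf("after done: %s\n",
	       reader_snapshot(&rq, &xid, &proc) ? "report" : "skip");
	return 0;
}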
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 4f4282d4eeca..de1e0dfed06a 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -27,12 +27,12 @@ struct nfsd4_layout_ops {
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdevp);
__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdevp);
+ const struct nfsd4_getdeviceinfo *gdevp);
__be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
struct nfsd4_layoutget *lgp);
- __be32 (*encode_layoutget)(struct xdr_stream *,
- struct nfsd4_layoutget *lgp);
+ __be32 (*encode_layoutget)(struct xdr_stream *xdr,
+ const struct nfsd4_layoutget *lgp);
__be32 (*proc_layoutcommit)(struct inode *inode,
struct nfsd4_layoutcommit *lcp);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index cbddcf484dba..41bdc913fa71 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
/* Maximum number of slots per session. 160 is useful for long haul TCP */
#define NFSD_MAX_SLOTS_PER_SESSION 160
-/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND 50
/* Maximum session per slot cache size */
#define NFSD_SLOT_CACHE_SIZE 2048
/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 63797635e1c3..12d79f5d4eb1 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -60,7 +60,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
#ifdef CONFIG_NFSD_V4
/* Show count for individual nfsv4 operations */
/* Writing operation numbers 0 1 2 also for maintaining uniformity */
- seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
+ seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1);
for (i = 0; i <= LAST_NFS4_OP; i++) {
seq_printf(seq, " %lld",
percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
@@ -76,7 +76,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
+int nfsd_percpu_counters_init(struct percpu_counter *counters, int num)
{
int i, err = 0;
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index cf5524e7ca06..14f50c660b61 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -37,9 +37,9 @@ extern struct nfsd_stats nfsdstats;
extern struct svc_stat nfsd_svcstats;
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num);
+int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
+void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
+void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
int nfsd_stat_init(void);
void nfsd_stat_shutdown(void);
@@ -61,22 +61,22 @@ static inline void nfsd_stats_rc_nocache_inc(void)
static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
{
percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
- if (exp)
- percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]);
+ if (exp && exp->ex_stats)
+ percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]);
}
static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
{
percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
- if (exp)
- percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount);
+ if (exp && exp->ex_stats)
+ percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
}
static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
{
percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
- if (exp)
- percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount);
+ if (exp && exp->ex_stats)
+ percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
}
static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 803904348871..fbc0ccb40424 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1863,6 +1863,93 @@ TRACE_EVENT(nfsd_end_grace,
)
);
+DECLARE_EVENT_CLASS(nfsd_copy_class,
+ TP_PROTO(
+ const struct nfsd4_copy *copy
+ ),
+ TP_ARGS(copy),
+ TP_STRUCT__entry(
+ __field(bool, intra)
+ __field(bool, async)
+ __field(u32, src_cl_boot)
+ __field(u32, src_cl_id)
+ __field(u32, src_so_id)
+ __field(u32, src_si_generation)
+ __field(u32, dst_cl_boot)
+ __field(u32, dst_cl_id)
+ __field(u32, dst_so_id)
+ __field(u32, dst_si_generation)
+ __field(u64, src_cp_pos)
+ __field(u64, dst_cp_pos)
+ __field(u64, cp_count)
+ __sockaddr(addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ const stateid_t *src_stp = &copy->cp_src_stateid;
+ const stateid_t *dst_stp = &copy->cp_dst_stateid;
+
+ __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+ __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ __entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot;
+ __entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id;
+ __entry->src_so_id = src_stp->si_opaque.so_id;
+ __entry->src_si_generation = src_stp->si_generation;
+ __entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot;
+ __entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id;
+ __entry->dst_so_id = dst_stp->si_opaque.so_id;
+ __entry->dst_si_generation = dst_stp->si_generation;
+ __entry->src_cp_pos = copy->cp_src_pos;
+ __entry->dst_cp_pos = copy->cp_dst_pos;
+ __entry->cp_count = copy->cp_count;
+ __assign_sockaddr(addr, &copy->cp_clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("client=%pISpc intra=%d async=%d "
+ "src_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+ "dst_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+ "cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu",
+ __get_sockaddr(addr), __entry->intra, __entry->async,
+ __entry->src_si_generation, __entry->src_cl_boot,
+ __entry->src_cl_id, __entry->src_so_id,
+ __entry->dst_si_generation, __entry->dst_cl_boot,
+ __entry->dst_cl_id, __entry->dst_so_id,
+ __entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count
+ )
+);
+
+#define DEFINE_COPY_EVENT(name) \
+DEFINE_EVENT(nfsd_copy_class, nfsd_copy_##name, \
+ TP_PROTO(const struct nfsd4_copy *copy), \
+ TP_ARGS(copy))
+
+DEFINE_COPY_EVENT(inter);
+DEFINE_COPY_EVENT(intra);
+DEFINE_COPY_EVENT(do_async);
+
+TRACE_EVENT(nfsd_copy_done,
+ TP_PROTO(
+ const struct nfsd4_copy *copy,
+ __be32 status
+ ),
+ TP_ARGS(copy, status),
+ TP_STRUCT__entry(
+ __field(int, status)
+ __field(bool, intra)
+ __field(bool, async)
+ __sockaddr(addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->status = be32_to_cpu(status);
+ __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+ __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ __assign_sockaddr(addr, &copy->cp_clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc status=%d intra=%d async=%d ",
+ __get_sockaddr(addr), __entry->status, __entry->intra, __entry->async
+ )
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 02f5fcaad03f..e01e4e2acbd9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -337,6 +337,24 @@ out:
return err;
}
+static void
+commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp,
+ int err)
+{
+ switch (err) {
+ case -EAGAIN:
+ case -ESTALE:
+ /*
+		 * Neither of these is the result of a problem with
+ * durable storage, so avoid a write verifier reset.
+ */
+ break;
+ default:
+ nfsd_reset_write_verifier(nn);
+ trace_nfsd_writeverf_reset(nn, rqstp, err);
+ }
+}
+
/*
* Commit metadata changes to stable storage.
*/
@@ -520,7 +538,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_sanitize_attrs(inode, iap);
- if (check_guard && guardtime != inode_get_ctime(inode).tv_sec)
+ if (check_guard && guardtime != inode_get_ctime_sec(inode))
return nfserr_notsync;
/*
@@ -647,8 +665,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
&nfsd4_get_cstate(rqstp)->current_fh,
dst_pos,
count, status);
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, status);
+ commit_reset_write_verifier(nn, rqstp, status);
ret = nfserrno(status);
}
}
@@ -823,7 +840,7 @@ int nfsd_open_break_lease(struct inode *inode, int access)
* and additional flags.
* N.B. After this call fhp needs an fh_put
*/
-static __be32
+static int
__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int may_flags, struct file **filp)
{
@@ -831,14 +848,12 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
struct inode *inode;
struct file *file;
int flags = O_RDONLY|O_LARGEFILE;
- __be32 err;
- int host_err = 0;
+ int host_err = -EPERM;
path.mnt = fhp->fh_export->ex_path.mnt;
path.dentry = fhp->fh_dentry;
inode = d_inode(path.dentry);
- err = nfserr_perm;
if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
@@ -847,7 +862,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
host_err = nfsd_open_break_lease(inode, may_flags);
if (host_err) /* NOMEM or WOULDBLOCK */
- goto out_nfserr;
+ goto out;
if (may_flags & NFSD_MAY_WRITE) {
if (may_flags & NFSD_MAY_READ)
@@ -859,13 +874,13 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
file = dentry_open(&path, flags, current_cred());
if (IS_ERR(file)) {
host_err = PTR_ERR(file);
- goto out_nfserr;
+ goto out;
}
host_err = ima_file_check(file, may_flags);
if (host_err) {
fput(file);
- goto out_nfserr;
+ goto out;
}
if (may_flags & NFSD_MAY_64BIT_COOKIE)
@@ -874,10 +889,8 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
file->f_mode |= FMODE_32BITHASH;
*filp = file;
-out_nfserr:
- err = nfserrno(host_err);
out:
- return err;
+ return host_err;
}
__be32
@@ -885,9 +898,9 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int may_flags, struct file **filp)
{
__be32 err;
+ int host_err;
bool retried = false;
- validate_process_creds();
/*
* If we get here, then the client has already done an "open",
* and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -904,14 +917,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
retry:
err = fh_verify(rqstp, fhp, type, may_flags);
if (!err) {
- err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
- if (err == nfserr_stale && !retried) {
+ host_err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ if (host_err == -EOPENSTALE && !retried) {
retried = true;
fh_put(fhp);
goto retry;
}
+ err = nfserrno(host_err);
}
- validate_process_creds();
return err;
}
@@ -922,18 +935,13 @@ retry:
* @may_flags: internal permission flags
* @filp: OUT: open "struct file *"
*
- * Returns an nfsstat value in network byte order.
+ * Returns zero on success, or a negative errno value.
*/
-__be32
+int
nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
struct file **filp)
{
- __be32 err;
-
- validate_process_creds();
- err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
- validate_process_creds();
- return err;
+ return __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
}
/*
@@ -1172,8 +1180,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
host_err = vfs_iter_write(file, &iter, &pos, flags);
file_end_write(file);
if (host_err < 0) {
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, host_err);
+ commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
}
*cnt = host_err;
@@ -1185,10 +1192,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (stable && use_wgather) {
host_err = wait_for_concurrent_writes(file);
- if (host_err < 0) {
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, host_err);
- }
+ if (host_err < 0)
+ commit_reset_write_verifier(nn, rqstp, host_err);
}
out_nfserr:
@@ -1331,8 +1336,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
err = nfserr_notsupp;
break;
default:
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, err2);
+ commit_reset_write_verifier(nn, rqstp, err2);
err = nfserrno(err2);
}
} else
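Note: the vfs.c changes above sharpen the split between the two error domains: __nfsd_open() and nfsd_open_verified() now stay in the negative-errno world, and callers convert to an NFS status (__be32, network byte order) only at the boundary via nfserrno(). A standalone sketch of keeping the two domains apart; nfs_errno() and its small mapping table are illustrative stand-ins rather than the kernel's full nfserrno() table, though the status numbers used are the classic NFS error codes:

/* Standalone sketch: negative host errnos inside the server, big-endian NFS
 * status codes on the wire, converted only at the boundary.
 */
#include <arpa/inet.h>	/* htonl, ntohl */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t be32;			/* stand-in for the kernel's __be32 */

#define NFS_OK		((be32)htonl(0))
#define NFSERR_PERM	((be32)htonl(1))
#define NFSERR_NOENT	((be32)htonl(2))
#define NFSERR_IO	((be32)htonl(5))
#define NFSERR_ACCES	((be32)htonl(13))
#define NFSERR_STALE	((be32)htonl(70))

/* Convert a negative host errno into an NFS status at the RPC boundary. */
static be32 nfs_errno(int host_err)
{
	switch (host_err) {
	case 0:		return NFS_OK;
	case -EPERM:	return NFSERR_PERM;
	case -ENOENT:	return NFSERR_NOENT;
	case -EACCES:	return NFSERR_ACCES;
	case -ESTALE:	return NFSERR_STALE;
	default:	return NFSERR_IO;	/* catch-all for the sketch */
	}
}

/* Internal helper in the "errno domain", like the reworked __nfsd_open(). */
static int open_internal(int want_write)
{
	return want_write ? -EPERM : 0;	/* pretend the object is read-only */
}

int main(void)
{
	int host_err = open_internal(1);
	be32 status = nfs_errno(host_err);

	printf("host_err=%d wire status=0x%08x (host order %u)\n",
	       host_err, (unsigned)status, (unsigned)ntohl(status));
	return 0;
}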
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a6890ea7b765..e3c29596f4df 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -104,8 +104,8 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int nfsd_open_break_lease(struct inode *, int);
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *,
- int, struct file **);
+int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ int may_flags, struct file **filp);
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
unsigned long *count,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9d918a79dc16..80e859dc84d8 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -50,6 +50,134 @@
#define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f))
#define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f))
+/**
+ * nfsd4_encode_bool - Encode an XDR bool type result
+ * @xdr: target XDR stream
+ * @val: boolean value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_bool(struct xdr_stream *xdr, bool val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p = val ? xdr_one : xdr_zero;
+ return nfs_ok;
+}
+
+/**
+ * nfsd4_encode_uint32_t - Encode an XDR uint32_t type result
+ * @xdr: target XDR stream
+ * @val: integer value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_uint32_t(struct xdr_stream *xdr, u32 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p = cpu_to_be32(val);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_aceflag4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_acemask4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_acetype4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_count4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_mode4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_nfs_lease4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_qop4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_sequenceid4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_slotid4(x, v) nfsd4_encode_uint32_t(x, v)
+
+/**
+ * nfsd4_encode_uint64_t - Encode an XDR uint64_t type result
+ * @xdr: target XDR stream
+ * @val: integer value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_uint64_t(struct xdr_stream *xdr, u64 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ put_unaligned_be64(val, p);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_changeid4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_nfs_cookie4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_length4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_offset4(x, v) nfsd4_encode_uint64_t(x, v)
+
+/**
+ * nfsd4_encode_opaque_fixed - Encode a fixed-length XDR opaque type result
+ * @xdr: target XDR stream
+ * @data: pointer to data
+ * @size: length of data in bytes
+ *
+ * Return values:
+ * %nfs_ok: @data encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_opaque_fixed(struct xdr_stream *xdr, const void *data,
+ size_t size)
+{
+ __be32 *p = xdr_reserve_space(xdr, xdr_align_size(size));
+ size_t pad = xdr_pad_size(size);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ memcpy(p, data, size);
+ if (pad)
+ memset((char *)p + size, 0, pad);
+ return nfs_ok;
+}
+
+/**
+ * nfsd4_encode_opaque - Encode a variable-length XDR opaque type result
+ * @xdr: target XDR stream
+ * @data: pointer to data
+ * @size: length of data in bytes
+ *
+ * Return values:
+ * %nfs_ok: @data encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_opaque(struct xdr_stream *xdr, const void *data, size_t size)
+{
+ size_t pad = xdr_pad_size(size);
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(size));
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p++ = cpu_to_be32(size);
+ memcpy(p, data, size);
+ if (pad)
+ memset((char *)p + size, 0, pad);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_component4(x, d, s) nfsd4_encode_opaque(x, d, s)
+
struct nfsd4_compound_state {
struct svc_fh current_fh;
struct svc_fh save_fh;
@@ -170,12 +298,8 @@ struct nfsd4_lock {
} v;
/* response */
- union {
- struct {
- stateid_t stateid;
- } ok;
- struct nfsd4_lock_denied denied;
- } u;
+ stateid_t lk_resp_stateid;
+ struct nfsd4_lock_denied lk_denied;
};
#define lk_new_open_seqid v.new.open_seqid
#define lk_new_open_stateid v.new.open_stateid
@@ -185,20 +309,15 @@ struct nfsd4_lock {
#define lk_old_lock_stateid v.old.lock_stateid
#define lk_old_lock_seqid v.old.lock_seqid
-#define lk_resp_stateid u.ok.stateid
-#define lk_denied u.denied
-
-
struct nfsd4_lockt {
u32 lt_type;
clientid_t lt_clientid;
struct xdr_netobj lt_owner;
u64 lt_offset;
u64 lt_length;
- struct nfsd4_lock_denied lt_denied;
+ struct nfsd4_lock_denied lt_denied;
};
-
struct nfsd4_locku {
u32 lu_type;
u32 lu_seqid;
@@ -267,9 +386,9 @@ struct nfsd4_open {
u32 op_deleg_want; /* request */
stateid_t op_stateid; /* response */
__be32 op_xdr_error; /* see nfsd4_open_omfg() */
- u32 op_recall; /* recall */
struct nfsd4_change_info op_cinfo; /* response */
u32 op_rflags; /* response */
+ bool op_recall; /* response */
bool op_truncate; /* used during processing */
bool op_created; /* used during processing */
struct nfs4_openowner *op_openowner; /* used during processing */
@@ -496,7 +615,7 @@ struct nfsd4_layoutcommit {
u32 lc_layout_type; /* request */
u32 lc_up_len; /* layout length */
void *lc_up_layout; /* decoded by callback */
- u32 lc_size_chg; /* boolean for response */
+ bool lc_size_chg; /* response */
u64 lc_newsize; /* response */
};
@@ -508,7 +627,7 @@ struct nfsd4_layoutreturn {
u32 lrf_body_len; /* request */
void *lrf_body; /* request */
stateid_t lr_sid; /* request/response */
- u32 lrs_present; /* response */
+ bool lrs_present; /* response */
};
struct nfsd4_fallocate {
@@ -626,8 +745,7 @@ struct nfsd4_copy_notify {
/* response */
stateid_t cpn_cnr_stateid;
- u64 cpn_sec;
- u32 cpn_nsec;
+ struct timespec64 cpn_lease_time;
struct nl4_server *cpn_src;
};
@@ -820,8 +938,10 @@ extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, union nfsd4_op_u *u);
extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
+extern void nfsd4_lock_release(union nfsd4_op_u *u);
extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
+extern void nfsd4_lockt_release(union nfsd4_op_u *u);
extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
extern __be32
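Note: the nfsd4_encode_opaque() helpers above reserve xdr_align_size(size) bytes and zero the xdr_pad_size(size) tail so every item ends on a 4-byte XDR boundary, with the variable-length form adding a 32-bit length prefix. A standalone sketch of that length-plus-padding layout over a plain buffer; align4()/pad4() are local stand-ins for the kernel helpers:

/* Standalone sketch of XDR variable-length opaque encoding:
 * 4-byte big-endian length, the bytes themselves, then zero padding up to
 * the next 4-byte boundary.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XDR_UNIT 4

static size_t align4(size_t n)
{
	return (n + XDR_UNIT - 1) & ~((size_t)XDR_UNIT - 1);
}

static size_t pad4(size_t n)
{
	return align4(n) - n;
}

/* Returns the number of bytes written, or 0 if the buffer is too small. */
static size_t encode_opaque(unsigned char *out, size_t avail,
			    const void *data, size_t size)
{
	size_t need = XDR_UNIT + align4(size);

	if (avail < need)
		return 0;	/* nfserr_resource, in the kernel helpers */

	/* 32-bit big-endian length prefix */
	out[0] = size >> 24;
	out[1] = size >> 16;
	out[2] = size >> 8;
	out[3] = size;

	memcpy(out + XDR_UNIT, data, size);
	memset(out + XDR_UNIT + size, 0, pad4(size));	/* zero the tail */
	return need;
}

int main(void)
{
	unsigned char buf[32];
	const char name[] = "abcde";	/* 5 bytes -> 3 bytes of padding */
	size_t n = encode_opaque(buf, sizeof(buf), name, strlen(name));
	size_t i;

	for (i = 0; i < n; i++)
		printf("%02x%s", buf[i], (i % XDR_UNIT == 3) ? "\n" : " ");
	return 0;
}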
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index bce734b68f08..de2073c47651 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, mapping, from, to);
nilfs_put_page(page);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
/*
@@ -519,7 +519,7 @@ got_it:
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, page->mapping, from, to);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
nilfs_mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
@@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
nilfs_commit_chunk(page, mapping, from, to);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
out:
nilfs_put_page(page);
return err;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 1a8bd5993476..f861f3a0bf5c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
atomic64_inc(&root->inodes_count);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = ino;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
@@ -449,12 +449,12 @@ int nilfs_read_inode_common(struct inode *inode,
i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le64_to_cpu(raw_inode->i_size);
- inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+ inode_set_atime(inode, le64_to_cpu(raw_inode->i_mtime),
+ le32_to_cpu(raw_inode->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime),
le32_to_cpu(raw_inode->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(raw_inode->i_mtime),
+ le32_to_cpu(raw_inode->i_mtime_nsec));
if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
return -EIO; /* this inode is for metadata and corrupted */
if (inode->i_nlink == 0)
@@ -768,10 +768,10 @@ void nilfs_write_inode_common(struct inode *inode,
raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le64(inode->i_size);
- raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ raw_inode->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ raw_inode->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ raw_inode->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
raw_inode->i_flags = cpu_to_le32(ii->i_flags);
@@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode)
nilfs_truncate_bmap(ii, blkoff);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 19c8158605ed..c97c77a39668 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -356,30 +356,28 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
*/
int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
{
- pgoff_t index = (pgoff_t)block >>
- (PAGE_SHIFT - inode->i_blkbits);
- struct page *page;
- unsigned long first_block;
+ pgoff_t index = block >> (PAGE_SHIFT - inode->i_blkbits);
+ struct folio *folio;
+ struct buffer_head *bh;
int ret = 0;
int still_dirty;
- page = find_lock_page(inode->i_mapping, index);
- if (!page)
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio))
return -ENOENT;
- wait_on_page_writeback(page);
-
- first_block = (unsigned long)index <<
- (PAGE_SHIFT - inode->i_blkbits);
- if (page_has_buffers(page)) {
- struct buffer_head *bh;
+ folio_wait_writeback(folio);
- bh = nilfs_page_get_nth_block(page, block - first_block);
+ bh = folio_buffers(folio);
+ if (bh) {
+ unsigned long first_block = index <<
+ (PAGE_SHIFT - inode->i_blkbits);
+ bh = get_nth_bh(bh, block - first_block);
nilfs_forget_buffer(bh);
}
- still_dirty = PageDirty(page);
- unlock_page(page);
- put_page(page);
+ still_dirty = folio_test_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
if (still_dirty ||
invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
@@ -560,17 +558,19 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
{
struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
struct buffer_head *bh_frozen;
- struct page *page;
+ struct folio *folio;
int blkbits = inode->i_blkbits;
- page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index);
- if (!page)
- return -ENOMEM;
+ folio = filemap_grab_folio(shadow->inode->i_mapping,
+ bh->b_folio->index);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << blkbits, 0);
+ bh_frozen = folio_buffers(folio);
+ if (!bh_frozen)
+ bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0);
- bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
+ bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits);
if (!buffer_uptodate(bh_frozen))
nilfs_copy_buffer(bh_frozen, bh);
@@ -582,8 +582,8 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
brelse(bh_frozen); /* already frozen */
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
}
@@ -592,17 +592,19 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
{
struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
struct buffer_head *bh_frozen = NULL;
- struct page *page;
+ struct folio *folio;
int n;
- page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index);
- if (page) {
- if (page_has_buffers(page)) {
+ folio = filemap_lock_folio(shadow->inode->i_mapping,
+ bh->b_folio->index);
+ if (!IS_ERR(folio)) {
+ bh_frozen = folio_buffers(folio);
+ if (bh_frozen) {
n = bh_offset(bh) >> inode->i_blkbits;
- bh_frozen = nilfs_page_get_nth_block(page, n);
+ bh_frozen = get_nth_bh(bh_frozen, n);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
return bh_frozen;
}
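Note: part of the folio conversion above is an error-convention change: find_lock_page() and grab_cache_page() returned NULL on failure, while filemap_lock_folio() and filemap_grab_folio() return an ERR_PTR()-encoded errno, hence the new IS_ERR()/PTR_ERR() checks. A standalone sketch of that error-pointer idiom, assuming the usual trick of reserving the top 4095 values of the address space for errno codes; lookup_folio() is invented for the example:

/* Standalone sketch of the ERR_PTR()/IS_ERR()/PTR_ERR() idiom: errno values
 * are smuggled inside a pointer, so callers get one return slot for both the
 * object and the error case.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* A lookup in the new style: never NULL, always an object or an ERR_PTR. */
static void *lookup_folio(int present)
{
	static int folio;	/* stand-in object */

	if (!present)
		return ERR_PTR(-ENOENT);
	return &folio;
}

int main(void)
{
	void *folio = lookup_folio(0);

	if (IS_ERR(folio))
		printf("lookup failed: %ld\n", PTR_ERR(folio));	/* -2 */

	folio = lookup_folio(1);
	if (!IS_ERR(folio))
		printf("got folio at %p\n", folio);
	return 0;
}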
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index b4e54d079b7d..06b04758f289 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -25,19 +25,19 @@
(BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \
BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked))
-static struct buffer_head *
-__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
- int blkbits, unsigned long b_state)
+static struct buffer_head *__nilfs_get_folio_block(struct folio *folio,
+ unsigned long block, pgoff_t index, int blkbits,
+ unsigned long b_state)
{
unsigned long first_block;
- struct buffer_head *bh;
+ struct buffer_head *bh = folio_buffers(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << blkbits, b_state);
+ if (!bh)
+ bh = create_empty_buffers(folio, 1 << blkbits, b_state);
first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
- bh = nilfs_page_get_nth_block(page, block - first_block);
+ bh = get_nth_bh(bh, block - first_block);
touch_buffer(bh);
wait_on_buffer(bh);
@@ -51,17 +51,17 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
{
int blkbits = inode->i_blkbits;
pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits);
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
- page = grab_cache_page(mapping, index);
- if (unlikely(!page))
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio))
return NULL;
- bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
+ bh = __nilfs_get_folio_block(folio, blkoff, index, blkbits, b_state);
if (unlikely(!bh)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return NULL;
}
return bh;
@@ -184,30 +184,32 @@ void nilfs_page_bug(struct page *page)
}
/**
- * nilfs_copy_page -- copy the page with buffers
- * @dst: destination page
- * @src: source page
- * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
+ * nilfs_copy_folio -- copy the folio with buffers
+ * @dst: destination folio
+ * @src: source folio
+ * @copy_dirty: flag whether to copy dirty states on the folio's buffer heads.
*
- * This function is for both data pages and btnode pages. The dirty flag
- * should be treated by caller. The page must not be under i/o.
- * Both src and dst page must be locked
+ * This function is for both data folios and btnode folios. The dirty flag
+ * must be handled by the caller. The folios must not be under I/O.
+ * Both the src and dst folios must be locked.
*/
-static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
+static void nilfs_copy_folio(struct folio *dst, struct folio *src,
+ bool copy_dirty)
{
struct buffer_head *dbh, *dbufs, *sbh;
unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
- BUG_ON(PageWriteback(dst));
+ BUG_ON(folio_test_writeback(dst));
- sbh = page_buffers(src);
- if (!page_has_buffers(dst))
- create_empty_buffers(dst, sbh->b_size, 0);
+ sbh = folio_buffers(src);
+ dbh = folio_buffers(dst);
+ if (!dbh)
+ dbh = create_empty_buffers(dst, sbh->b_size, 0);
if (copy_dirty)
mask |= BIT(BH_Dirty);
- dbh = dbufs = page_buffers(dst);
+ dbufs = dbh;
do {
lock_buffer(sbh);
lock_buffer(dbh);
@@ -218,16 +220,16 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
dbh = dbh->b_this_page;
} while (dbh != dbufs);
- copy_highpage(dst, src);
+ folio_copy(dst, src);
- if (PageUptodate(src) && !PageUptodate(dst))
- SetPageUptodate(dst);
- else if (!PageUptodate(src) && PageUptodate(dst))
- ClearPageUptodate(dst);
- if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
- SetPageMappedToDisk(dst);
- else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
- ClearPageMappedToDisk(dst);
+ if (folio_test_uptodate(src) && !folio_test_uptodate(dst))
+ folio_mark_uptodate(dst);
+ else if (!folio_test_uptodate(src) && folio_test_uptodate(dst))
+ folio_clear_uptodate(dst);
+ if (folio_test_mappedtodisk(src) && !folio_test_mappedtodisk(dst))
+ folio_set_mappedtodisk(dst);
+ else if (!folio_test_mappedtodisk(src) && folio_test_mappedtodisk(dst))
+ folio_clear_mappedtodisk(dst);
do {
unlock_buffer(sbh);
@@ -269,7 +271,7 @@ repeat:
NILFS_PAGE_BUG(&folio->page,
"found empty page in dat page cache");
- nilfs_copy_page(&dfolio->page, &folio->page, 1);
+ nilfs_copy_folio(dfolio, folio, true);
filemap_dirty_folio(folio_mapping(dfolio), dfolio);
folio_unlock(dfolio);
@@ -314,7 +316,7 @@ repeat:
if (!IS_ERR(dfolio)) {
/* overwrite existing folio in the destination cache */
WARN_ON(folio_test_dirty(dfolio));
- nilfs_copy_page(&dfolio->page, &folio->page, 0);
+ nilfs_copy_folio(dfolio, folio, false);
folio_unlock(dfolio);
folio_put(dfolio);
/* Do we not need to remove folio from smap here? */
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 21ddcdd4d63e..d249ea1cefff 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -52,15 +52,4 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
#define NILFS_PAGE_BUG(page, m, a...) \
do { nilfs_page_bug(page); BUG(); } while (0)
-static inline struct buffer_head *
-nilfs_page_get_nth_block(struct page *page, unsigned int count)
-{
- struct buffer_head *bh = page_buffers(page);
-
- while (count-- > 0)
- bh = bh->b_this_page;
- get_bh(bh);
- return bh;
-}
-
#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7ec16879756e..55e31cc903d1 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -731,10 +731,9 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
continue;
}
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, i_blocksize(inode), 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio,
+ i_blocksize(inode), 0);
folio_unlock(folio);
bh = head;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2c6078a6b8ec..58ca7c936393 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -501,15 +501,38 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
down_write(&NILFS_MDT(sufile)->mi_sem);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
- if (!ret) {
- mark_buffer_dirty(bh);
- nilfs_mdt_mark_dirty(sufile);
- kaddr = kmap_atomic(bh->b_page);
- su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ if (ret)
+ goto out_sem;
+
+ kaddr = kmap_atomic(bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ if (unlikely(nilfs_segment_usage_error(su))) {
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+
+ kunmap_atomic(kaddr);
+ brelse(bh);
+ if (nilfs_segment_is_active(nilfs, segnum)) {
+ nilfs_error(sufile->i_sb,
+ "active segment %llu is erroneous",
+ (unsigned long long)segnum);
+ } else {
+ /*
+ * Segments marked erroneous are never allocated by
+ * nilfs_sufile_alloc(); only active segments, i.e.,
+ * the segments indexed by ns_segnum or ns_nextnum,
+ * can be erroneous here.
+ */
+ WARN_ON_ONCE(1);
+ }
+ ret = -EIO;
+ } else {
nilfs_segment_usage_set_dirty(su);
kunmap_atomic(kaddr);
+ mark_buffer_dirty(bh);
+ nilfs_mdt_mark_dirty(sufile);
brelse(bh);
}
+out_sem:
up_write(&NILFS_MDT(sufile)->mi_sem);
return ret;
}
@@ -536,9 +559,14 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
- WARN_ON(nilfs_segment_usage_error(su));
- if (modtime)
+ if (modtime) {
+ /*
+ * Check segusage error and set su_lastmod only when updating
+ * this entry with a valid timestamp, not for cancellation.
+ */
+ WARN_ON_ONCE(nilfs_segment_usage_error(su));
su->su_lastmod = cpu_to_le64(modtime);
+ }
su->su_nblocks = cpu_to_le32(nblocks);
kunmap_atomic(kaddr);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0f0667957c81..71400496ed36 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -716,7 +716,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
goto failed_sbh;
}
nilfs_release_super_block(nilfs);
- sb_set_blocksize(sb, blocksize);
+ if (!sb_set_blocksize(sb, blocksize)) {
+ nilfs_err(sb, "bad blocksize %d", blocksize);
+ err = -EINVAL;
+ goto out;
+ }
err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
if (err)
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..1cb9ad7e884e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -39,9 +39,9 @@ static void __init dnotify_sysctl_init(void)
#define dnotify_sysctl_init() do { } while (0)
#endif
-static struct kmem_cache *dnotify_struct_cache __read_mostly;
-static struct kmem_cache *dnotify_mark_cache __read_mostly;
-static struct fsnotify_group *dnotify_group __read_mostly;
+static struct kmem_cache *dnotify_struct_cache __ro_after_init;
+static struct kmem_cache *dnotify_mark_cache __ro_after_init;
+static struct fsnotify_group *dnotify_group __ro_after_init;
/*
* dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
struct dnotify_struct *dn;
struct inode *inode;
fl_owner_t id = current->files;
- struct file *f;
+ struct file *f = NULL;
int destroy = 0, error = 0;
__u32 mask;
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
}
rcu_read_lock();
- f = lookup_fd_rcu(fd);
+ f = lookup_fdget_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
fsnotify_put_mark(new_fsn_mark);
if (dn)
kmem_cache_free(dnotify_struct_cache, dn);
+ if (f)
+ fput(f);
return error;
}
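
A minimal sketch of the fd re-check pattern the dnotify hunks move to, assuming lookup_fdget_rcu() from <linux/fdtable.h>; unlike lookup_fd_rcu(), it returns the file with a reference held, which is why the exit path above gains an fput(). The wrapper name is illustrative only:

#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/rcupdate.h>

static bool example_fd_still_matches(unsigned int fd, struct file *filp)
{
	struct file *f;
	bool match;

	rcu_read_lock();
	f = lookup_fdget_rcu(fd);	/* takes a reference if the fd resolves */
	rcu_read_unlock();

	match = (f == filp);
	if (f)
		fput(f);		/* drop the reference on every path */
	return match;
}
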
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index e8a3c28c5d12..6936671e148d 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -275,9 +275,9 @@ static inline void fanotify_init_event(struct fanotify_event *event,
#define FANOTIFY_INLINE_FH(name, size) \
struct { \
- struct fanotify_fh (name); \
+ struct fanotify_fh name; \
/* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
- unsigned char _inline_fh_buf[(size)]; \
+ unsigned char _inline_fh_buf[size]; \
}
struct fanotify_fid_event {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 62fe0b679e58..4d765c72496f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -112,10 +112,10 @@ static void __init fanotify_sysctls_init(void)
extern const struct fsnotify_ops fanotify_fsnotify_ops;
-struct kmem_cache *fanotify_mark_cache __read_mostly;
-struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
-struct kmem_cache *fanotify_path_event_cachep __read_mostly;
-struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+struct kmem_cache *fanotify_mark_cache __ro_after_init;
+struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
@@ -1595,7 +1595,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
* file handles so user can use name_to_handle_at() to compare fids
* reported with events to the file handle of watched objects.
*/
- if (!nop)
+ if (!exportfs_can_encode_fid(nop))
return -EOPNOTSUPP;
/*
@@ -1603,7 +1603,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
* supports decoding file handles, so user has a way to map back the
* reported fids to filesystem objects.
*/
- if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
+ if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
return -EOPNOTSUPP;
return 0;
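
A minimal sketch of the capability checks the fanotify hunks adopt, assuming the exportfs_can_encode_fid()/exportfs_can_decode_fh() helpers from <linux/exportfs.h>; the wrapper is illustrative only:

#include <linux/exportfs.h>

static bool example_can_report_fid(const struct export_operations *nop,
				   bool need_decode)
{
	/* The filesystem must be able to encode a file handle at all. */
	if (!exportfs_can_encode_fid(nop))
		return false;
	/* For non-inode marks it must also be able to decode one back. */
	return !need_decode || exportfs_can_decode_fh(nop);
}
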
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1c4bfdab008d..a3809ae92170 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -49,7 +49,7 @@
/* configurable via /proc/sys/fs/inotify/ */
static int inotify_max_queued_events __read_mostly;
-struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *inotify_inode_mark_cachep __ro_after_init;
#ifdef CONFIG_SYSCTL
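
A minimal sketch of the __read_mostly to __ro_after_init conversion seen in the dnotify, fanotify and inotify hunks, using a hypothetical cache that is assigned once during init and never written again:

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/slab.h>

struct example_mark { int dummy; };

static struct kmem_cache *example_mark_cachep __ro_after_init;

static int __init example_mark_init(void)
{
	/* Written exactly once at boot; the pointer is read-only afterwards. */
	example_mark_cachep = KMEM_CACHE(example_mark, SLAB_PANIC);
	return 0;
}
fs_initcall(example_mark_init);
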
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 647a22433bd8..9a4b228d42fa 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -84,7 +84,7 @@ slow:
return -ENOMEM;
}
inode->i_ino = ns->inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_flags |= S_IMMUTABLE;
inode->i_mode = S_IFREG | S_IRUGO;
inode->i_fop = &ns_file_operations;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 4e158bce4192..71e31e789b29 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -145,13 +145,12 @@ still_busy:
}
/**
- * ntfs_read_block - fill a @page of an address space with data
- * @page: page cache page to fill with data
+ * ntfs_read_block - fill a @folio of an address space with data
+ * @folio: page cache folio to fill with data
*
- * Fill the page @page of the address space belonging to the @page->host inode.
* We read each buffer asynchronously and when all buffers are read in, our io
* completion handler ntfs_end_buffer_read_async(), if required, automatically
- * applies the mst fixups to the page before finally marking it uptodate and
+ * applies the mst fixups to the folio before finally marking it uptodate and
* unlocking it.
*
* We only enforce allocated_size limit because i_size is checked for in
@@ -161,7 +160,7 @@ still_busy:
*
* Contains an adapted version of fs/buffer.c::block_read_full_folio().
*/
-static int ntfs_read_block(struct page *page)
+static int ntfs_read_block(struct folio *folio)
{
loff_t i_size;
VCN vcn;
@@ -178,7 +177,7 @@ static int ntfs_read_block(struct page *page)
int i, nr;
unsigned char blocksize_bits;
- vi = page->mapping->host;
+ vi = folio->mapping->host;
ni = NTFS_I(vi);
vol = ni->vol;
@@ -188,15 +187,10 @@ static int ntfs_read_block(struct page *page)
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize, 0);
- if (unlikely(!page_has_buffers(page))) {
- unlock_page(page);
- return -ENOMEM;
- }
- }
- bh = head = page_buffers(page);
- BUG_ON(!bh);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
+ bh = head;
/*
* We may be racing with truncate. To avoid some of the problems we
@@ -205,11 +199,11 @@ static int ntfs_read_block(struct page *page)
* may leave some buffers unmapped which are now allocated. This is
* not a problem since these buffers will just get mapped when a write
* occurs. In case of a shrinking truncate, we will detect this later
- * on due to the runlist being incomplete and if the page is being
+ * on due to the runlist being incomplete and if the folio is being
* fully truncated, truncate will throw it away as soon as we unlock
* it so no need to worry what we do with it.
*/
- iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
+ iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
init_size = ni->initialized_size;
@@ -221,7 +215,7 @@ static int ntfs_read_block(struct page *page)
}
zblock = (init_size + blocksize - 1) >> blocksize_bits;
- /* Loop through all the buffers in the page. */
+ /* Loop through all the buffers in the folio. */
rl = NULL;
nr = i = 0;
do {
@@ -299,7 +293,7 @@ lock_retry_remap:
if (!err)
err = -EIO;
bh->b_blocknr = -1;
- SetPageError(page);
+ folio_set_error(folio);
ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
"attribute type 0x%x, vcn 0x%llx, "
"offset 0x%x because its location on "
@@ -312,13 +306,13 @@ lock_retry_remap:
/*
* Either iblock was outside lblock limits or
* ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
- * of the page and set the buffer uptodate.
+ * of the folio and set the buffer uptodate.
*/
handle_hole:
bh->b_blocknr = -1UL;
clear_buffer_mapped(bh);
handle_zblock:
- zero_user(page, i * blocksize, blocksize);
+ folio_zero_range(folio, i * blocksize, blocksize);
if (likely(!err))
set_buffer_uptodate(bh);
} while (i++, iblock++, (bh = bh->b_this_page) != head);
@@ -349,11 +343,11 @@ handle_zblock:
return 0;
}
/* No i/o was scheduled on any of the buffers. */
- if (likely(!PageError(page)))
- SetPageUptodate(page);
+ if (likely(!folio_test_error(folio)))
+ folio_mark_uptodate(folio);
else /* Signal synchronous i/o error. */
nr = -EIO;
- unlock_page(page);
+ folio_unlock(folio);
return nr;
}
@@ -433,7 +427,7 @@ retry_readpage:
/* NInoNonResident() == NInoIndexAllocPresent() */
if (NInoNonResident(ni)) {
/* Normal, non-resident data stream. */
- return ntfs_read_block(page);
+ return ntfs_read_block(folio);
}
/*
* Attribute is resident, implying it is not compressed or encrypted.
@@ -507,28 +501,29 @@ err_out:
#ifdef NTFS_RW
/**
- * ntfs_write_block - write a @page to the backing store
- * @page: page cache page to write out
+ * ntfs_write_block - write a @folio to the backing store
+ * @folio: page cache folio to write out
* @wbc: writeback control structure
*
- * This function is for writing pages belonging to non-resident, non-mst
+ * This function is for writing folios belonging to non-resident, non-mst
* protected attributes to their backing store.
*
- * For a page with buffers, map and write the dirty buffers asynchronously
- * under page writeback. For a page without buffers, create buffers for the
- * page, then proceed as above.
+ * For a folio with buffers, map and write the dirty buffers asynchronously
+ * under folio writeback. For a folio without buffers, create buffers for the
+ * folio, then proceed as above.
*
- * If a page doesn't have buffers the page dirty state is definitive. If a page
- * does have buffers, the page dirty state is just a hint, and the buffer dirty
- * state is definitive. (A hint which has rules: dirty buffers against a clean
- * page is illegal. Other combinations are legal and need to be handled. In
- * particular a dirty page containing clean buffers for example.)
+ * If a folio doesn't have buffers the folio dirty state is definitive. If
+ * a folio does have buffers, the folio dirty state is just a hint,
+ * and the buffer dirty state is definitive. (A hint which has rules:
+ * dirty buffers against a clean folio is illegal. Other combinations are
+ * legal and need to be handled. In particular a dirty folio containing
+ * clean buffers for example.)
*
* Return 0 on success and -errno on error.
*
* Based on ntfs_read_block() and __block_write_full_folio().
*/
-static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
+static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
{
VCN vcn;
LCN lcn;
@@ -546,41 +541,29 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
bool need_end_writeback;
unsigned char blocksize_bits;
- vi = page->mapping->host;
+ vi = folio->mapping->host;
ni = NTFS_I(vi);
vol = ni->vol;
ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx.", ni->mft_no, ni->type, page->index);
+ "0x%lx.", ni->mft_no, ni->type, folio->index);
BUG_ON(!NInoNonResident(ni));
BUG_ON(NInoMstProtected(ni));
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- if (!page_has_buffers(page)) {
- BUG_ON(!PageUptodate(page));
- create_empty_buffers(page, blocksize,
+ head = folio_buffers(folio);
+ if (!head) {
+ BUG_ON(!folio_test_uptodate(folio));
+ head = create_empty_buffers(folio, blocksize,
(1 << BH_Uptodate) | (1 << BH_Dirty));
- if (unlikely(!page_has_buffers(page))) {
- ntfs_warning(vol->sb, "Error allocating page "
- "buffers. Redirtying page so we try "
- "again later.");
- /*
- * Put the page back on mapping->dirty_pages, but leave
- * its buffers' dirty state as-is.
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
}
- bh = head = page_buffers(page);
- BUG_ON(!bh);
+ bh = head;
/* NOTE: Different naming scheme to ntfs_read_block()! */
- /* The first block in the page. */
- block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
+ /* The first block in the folio. */
+ block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
i_size = i_size_read(vi);
@@ -597,14 +580,14 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
* Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
- * then we just miss that fact, and the page stays dirty.
+ * then we just miss that fact, and the folio stays dirty.
*
* Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
/*
- * Loop through all the buffers in the page, mapping all the dirty
+ * Loop through all the buffers in the folio, mapping all the dirty
* buffers to disk addresses and handling any aliases from the
* underlying block device's mapping.
*/
@@ -616,13 +599,13 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
if (unlikely(block >= dblock)) {
/*
* Mapped buffers outside i_size will occur, because
- * this page can be outside i_size when there is a
+ * this folio can be outside i_size when there is a
* truncate in progress. The contents of such buffers
* were zeroed by ntfs_writepage().
*
* FIXME: What about the small race window where
* ntfs_writepage() has not done any clearing because
- * the page was within i_size but before we get here,
+ * the folio was within i_size but before we get here,
* vmtruncate() modifies i_size?
*/
clear_buffer_dirty(bh);
@@ -638,38 +621,38 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
if (unlikely((block >= iblock) &&
(initialized_size < i_size))) {
/*
- * If this page is fully outside initialized
- * size, zero out all pages between the current
- * initialized size and the current page. Just
+ * If this folio is fully outside initialized
+ * size, zero out all folios between the current
+ * initialized size and the current folio. Just
* use ntfs_read_folio() to do the zeroing
* transparently.
*/
if (block > iblock) {
// TODO:
- // For each page do:
- // - read_cache_page()
- // Again for each page do:
- // - wait_on_page_locked()
- // - Check (PageUptodate(page) &&
- // !PageError(page))
+ // For each folio do:
+ // - read_cache_folio()
+ // Again for each folio do:
+ // - wait_on_folio_locked()
+ // - Check (folio_test_uptodate(folio) &&
+ // !folio_test_error(folio))
// Update initialized size in the attribute and
// in the inode.
- // Again, for each page do:
+ // Again, for each folio do:
// block_dirty_folio();
- // put_page()
+ // folio_put()
// We don't need to wait on the writes.
// Update iblock.
}
/*
- * The current page straddles initialized size. Zero
+ * The current folio straddles initialized size. Zero
* all non-uptodate buffers and set them uptodate (and
* dirty?). Note, there aren't any non-uptodate buffers
- * if the page is uptodate.
- * FIXME: For an uptodate page, the buffers may need to
+ * if the folio is uptodate.
+ * FIXME: For an uptodate folio, the buffers may need to
* be written out because they were not initialized on
* disk before.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
// TODO:
// Zero any non-uptodate buffers up to i_size.
// Set them uptodate and dirty.
@@ -727,14 +710,14 @@ lock_retry_remap:
unsigned long *bpos, *bend;
/* Check if the buffer is zero. */
- kaddr = kmap_atomic(page);
- bpos = (unsigned long *)(kaddr + bh_offset(bh));
- bend = (unsigned long *)((u8*)bpos + blocksize);
+ kaddr = kmap_local_folio(folio, bh_offset(bh));
+ bpos = (unsigned long *)kaddr;
+ bend = (unsigned long *)(kaddr + blocksize);
do {
if (unlikely(*bpos))
break;
} while (likely(++bpos < bend));
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (bpos == bend) {
/*
* Buffer is zero and sparse, no need to write
@@ -774,7 +757,7 @@ lock_retry_remap:
if (err == -ENOENT || lcn == LCN_ENOENT) {
bh->b_blocknr = -1;
clear_buffer_dirty(bh);
- zero_user(page, bh_offset(bh), blocksize);
+ folio_zero_range(folio, bh_offset(bh), blocksize);
set_buffer_uptodate(bh);
err = 0;
continue;
@@ -801,7 +784,7 @@ lock_retry_remap:
bh = head;
/* Just an optimization, so ->read_folio() is not called later. */
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
int uptodate = 1;
do {
if (!buffer_uptodate(bh)) {
@@ -811,7 +794,7 @@ lock_retry_remap:
}
} while ((bh = bh->b_this_page) != head);
if (uptodate)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/* Setup all mapped, dirty buffers for async write i/o. */
@@ -826,7 +809,7 @@ lock_retry_remap:
} else if (unlikely(err)) {
/*
* For the error case. The buffer may have been set
- * dirty during attachment to a dirty page.
+ * dirty during attachment to a dirty folio.
*/
if (err != -ENOMEM)
clear_buffer_dirty(bh);
@@ -839,20 +822,20 @@ lock_retry_remap:
err = 0;
else if (err == -ENOMEM) {
ntfs_warning(vol->sb, "Error allocating memory. "
- "Redirtying page so we try again "
+ "Redirtying folio so we try again "
"later.");
/*
- * Put the page back on mapping->dirty_pages, but
+ * Put the folio back on mapping->dirty_pages, but
* leave its buffer's dirty state as-is.
*/
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
err = 0;
} else
- SetPageError(page);
+ folio_set_error(folio);
}
- BUG_ON(PageWriteback(page));
- set_page_writeback(page); /* Keeps try_to_free_buffers() away. */
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */
/* Submit the prepared buffers for i/o. */
need_end_writeback = true;
@@ -864,11 +847,11 @@ lock_retry_remap:
}
bh = next;
} while (bh != head);
- unlock_page(page);
+ folio_unlock(folio);
- /* If no i/o was started, need to end_page_writeback(). */
+ /* If no i/o was started, need to end writeback here. */
if (unlikely(need_end_writeback))
- end_page_writeback(page);
+ folio_end_writeback(folio);
ntfs_debug("Done.");
return err;
@@ -1337,8 +1320,9 @@ done:
*/
static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
loff_t i_size;
- struct inode *vi = page->mapping->host;
+ struct inode *vi = folio->mapping->host;
ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
char *addr;
ntfs_attr_search_ctx *ctx = NULL;
@@ -1347,14 +1331,13 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
int err;
retry_writepage:
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
i_size = i_size_read(vi);
- /* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ /* Is the folio fully outside i_size? (truncate in progress) */
+ if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT)) {
- struct folio *folio = page_folio(page);
/*
- * The page may have dirty, unmapped buffers. Make them
+ * The folio may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
block_invalidate_folio(folio, 0, folio_size(folio));
@@ -1373,7 +1356,7 @@ retry_writepage:
if (ni->type != AT_INDEX_ALLOCATION) {
/* If file is encrypted, deny access, just like NT4. */
if (NInoEncrypted(ni)) {
- unlock_page(page);
+ folio_unlock(folio);
BUG_ON(ni->type != AT_DATA);
ntfs_debug("Denying write access to encrypted file.");
return -EACCES;
@@ -1384,14 +1367,14 @@ retry_writepage:
BUG_ON(ni->name_len);
// TODO: Implement and replace this with
// return ntfs_write_compressed_block(page);
- unlock_page(page);
+ folio_unlock(folio);
ntfs_error(vi->i_sb, "Writing to compressed files is "
"not supported yet. Sorry.");
return -EOPNOTSUPP;
}
// TODO: Implement and remove this check.
if (NInoNonResident(ni) && NInoSparse(ni)) {
- unlock_page(page);
+ folio_unlock(folio);
ntfs_error(vi->i_sb, "Writing to sparse files is not "
"supported yet. Sorry.");
return -EOPNOTSUPP;
@@ -1400,34 +1383,34 @@ retry_writepage:
/* NInoNonResident() == NInoIndexAllocPresent() */
if (NInoNonResident(ni)) {
/* We have to zero every time due to mmap-at-end-of-file. */
- if (page->index >= (i_size >> PAGE_SHIFT)) {
- /* The page straddles i_size. */
- unsigned int ofs = i_size & ~PAGE_MASK;
- zero_user_segment(page, ofs, PAGE_SIZE);
+ if (folio->index >= (i_size >> PAGE_SHIFT)) {
+ /* The folio straddles i_size. */
+ unsigned int ofs = i_size & (folio_size(folio) - 1);
+ folio_zero_segment(folio, ofs, folio_size(folio));
}
/* Handle mst protected attributes. */
if (NInoMstProtected(ni))
return ntfs_write_mst_block(page, wbc);
/* Normal, non-resident data stream. */
- return ntfs_write_block(page, wbc);
+ return ntfs_write_block(folio, wbc);
}
/*
* Attribute is resident, implying it is not compressed, encrypted, or
* mst protected. This also means the attribute is smaller than an mft
- * record and hence smaller than a page, so can simply return error on
- * any pages with index above 0. Note the attribute can actually be
+ * record and hence smaller than a folio, so can simply return error on
+ * any folios with index above 0. Note the attribute can actually be
* marked compressed but if it is resident the actual data is not
* compressed so we are ok to ignore the compressed flag here.
*/
- BUG_ON(page_has_buffers(page));
- BUG_ON(!PageUptodate(page));
- if (unlikely(page->index > 0)) {
- ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. "
- "Aborting write.", page->index);
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
+ BUG_ON(folio_buffers(folio));
+ BUG_ON(!folio_test_uptodate(folio));
+ if (unlikely(folio->index > 0)) {
+ ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. "
+ "Aborting write.", folio->index);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
return -EIO;
}
if (!NInoAttr(ni))
@@ -1460,12 +1443,12 @@ retry_writepage:
if (unlikely(err))
goto err_out;
/*
- * Keep the VM happy. This must be done otherwise the radix-tree tag
- * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
+ * Keep the VM happy. This must be done otherwise
+ * PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
*/
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
i_size = i_size_read(vi);
if (unlikely(attr_len > i_size)) {
@@ -1480,18 +1463,18 @@ retry_writepage:
/* Shrinking cannot fail. */
BUG_ON(err);
}
- addr = kmap_atomic(page);
- /* Copy the data from the page to the mft record. */
+ addr = kmap_local_folio(folio, 0);
+ /* Copy the data from the folio to the mft record. */
memcpy((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset),
addr, attr_len);
- /* Zero out of bounds area in the page cache page. */
- memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
- kunmap_atomic(addr);
- flush_dcache_page(page);
+ /* Zero out of bounds area in the page cache folio. */
+ memset(addr + attr_len, 0, folio_size(folio) - attr_len);
+ kunmap_local(addr);
+ flush_dcache_folio(folio);
flush_dcache_mft_record_page(ctx->ntfs_ino);
- /* We are done with the page. */
- end_page_writeback(page);
+ /* We are done with the folio. */
+ folio_end_writeback(folio);
/* Finally, mark the mft record dirty, so it gets written back. */
mark_mft_record_dirty(ctx->ntfs_ino);
ntfs_attr_put_search_ctx(ctx);
@@ -1502,18 +1485,18 @@ err_out:
ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
"page so we try again later.");
/*
- * Put the page back on mapping->dirty_pages, but leave its
+ * Put the folio back on mapping->dirty_pages, but leave its
* buffers' dirty state as-is.
*/
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
err = 0;
} else {
ntfs_error(vi->i_sb, "Resident attribute write failed with "
"error %i.", err);
- SetPageError(page);
+ folio_set_error(folio);
NVolSetErrors(ni->vol);
}
- unlock_page(page);
+ folio_unlock(folio);
if (ctx)
ntfs_attr_put_search_ctx(ctx);
if (m)
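
A minimal sketch of the local-mapping change used in ntfs_write_block() above, assuming the buffer lies within a single page of the folio (true for the order-0 folios this path handles); kmap_local_folio() maps starting at the block's own offset, so no bh_offset() arithmetic on a whole-page mapping is needed. The helper name is illustrative only:

#include <linux/buffer_head.h>
#include <linux/highmem.h>

static bool example_buffer_is_zero(struct folio *folio, struct buffer_head *bh,
				   unsigned int blocksize)
{
	unsigned long *kaddr = kmap_local_folio(folio, bh_offset(bh));
	unsigned long *p = kaddr;
	unsigned long *end = kaddr + blocksize / sizeof(*p);
	bool zero = true;

	while (p < end) {
		if (*p++) {
			zero = false;
			break;
		}
	}
	kunmap_local(kaddr);
	return zero;
}
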
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index cbc545999cfe..297c0b9db621 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -567,7 +567,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
LCN lcn;
s64 bh_pos, vcn_len, end, initialized_size;
sector_t lcn_block;
- struct page *page;
+ struct folio *folio;
struct inode *vi;
ntfs_inode *ni, *base_ni = NULL;
ntfs_volume *vol;
@@ -601,20 +601,6 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
(long long)pos, bytes);
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- u = 0;
- do {
- page = pages[u];
- BUG_ON(!page);
- /*
- * create_empty_buffers() will create uptodate/dirty buffers if
- * the page is uptodate/dirty.
- */
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize, 0);
- if (unlikely(!page_has_buffers(page)))
- return -ENOMEM;
- }
- } while (++u < nr_pages);
rl_write_locked = false;
rl = NULL;
err = 0;
@@ -626,14 +612,21 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
end = pos + bytes;
cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
/*
- * Loop over each page and for each page over each buffer. Use goto to
+ * Loop over each buffer in each folio. Use goto to
* reduce indentation.
*/
u = 0;
-do_next_page:
- page = pages[u];
- bh_pos = (s64)page->index << PAGE_SHIFT;
- bh = head = page_buffers(page);
+do_next_folio:
+ folio = page_folio(pages[u]);
+ bh_pos = folio_pos(folio);
+ head = folio_buffers(folio);
+ if (!head)
+ /*
+ * create_empty_buffers() will create uptodate/dirty
+ * buffers if the folio is uptodate/dirty.
+ */
+ head = create_empty_buffers(folio, blocksize, 0);
+ bh = head;
do {
VCN cdelta;
s64 bh_end;
@@ -653,15 +646,15 @@ do_next_page:
if (buffer_uptodate(bh))
continue;
/*
- * The buffer is not uptodate. If the page is uptodate
+ * The buffer is not uptodate. If the folio is uptodate
* set the buffer uptodate and otherwise ignore it.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
set_buffer_uptodate(bh);
continue;
}
/*
- * Neither the page nor the buffer are uptodate. If
+ * Neither the folio nor the buffer are uptodate. If
* the buffer is only partially being written to, we
* need to read it in before the write, i.e. now.
*/
@@ -679,7 +672,7 @@ do_next_page:
ntfs_submit_bh_for_read(bh);
*wait_bh++ = bh;
} else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -706,7 +699,7 @@ map_buffer_cached:
(bh_cofs >> blocksize_bits);
set_buffer_mapped(bh);
/*
- * If the page is uptodate so is the buffer. If the
+ * If the folio is uptodate so is the buffer. If the
* buffer is fully outside the write, we ignore it if
* it was already allocated and we mark it dirty so it
* gets written out if we allocated it. On the other
@@ -714,7 +707,7 @@ map_buffer_cached:
* marking it dirty we set buffer_new so we can do
* error recovery.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
if (unlikely(was_hole)) {
@@ -754,7 +747,8 @@ map_buffer_cached:
ntfs_submit_bh_for_read(bh);
*wait_bh++ = bh;
} else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio,
+ bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -773,7 +767,7 @@ map_buffer_cached:
*/
if (bh_end <= pos || bh_pos >= end) {
if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -786,7 +780,7 @@ map_buffer_cached:
u8 *kaddr;
unsigned pofs;
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_folio(folio, 0);
if (bh_pos < pos) {
pofs = bh_pos & ~PAGE_MASK;
memset(kaddr + pofs, 0, pos - bh_pos);
@@ -795,8 +789,8 @@ map_buffer_cached:
pofs = end & ~PAGE_MASK;
memset(kaddr + pofs, 0, bh_end - end);
}
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
+ kunmap_local(kaddr);
+ flush_dcache_folio(folio);
}
continue;
}
@@ -809,11 +803,12 @@ map_buffer_cached:
initialized_size = ni->allocated_size;
read_unlock_irqrestore(&ni->size_lock, flags);
if (bh_pos > initialized_size) {
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh), blocksize);
+ folio_zero_range(folio, bh_offset(bh),
+ blocksize);
set_buffer_uptodate(bh);
}
continue;
@@ -927,17 +922,17 @@ rl_not_mapped_enoent:
bh->b_blocknr = -1;
/*
* If the buffer is uptodate we skip it. If it
- * is not but the page is uptodate, we can set
- * the buffer uptodate. If the page is not
+ * is not but the folio is uptodate, we can set
+ * the buffer uptodate. If the folio is not
* uptodate, we can clear the buffer and set it
* uptodate. Whether this is worthwhile is
* debatable and this could be removed.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -1167,7 +1162,7 @@ rl_not_mapped_enoent:
} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
/* If there are no errors, do the next page. */
if (likely(!err && ++u < nr_pages))
- goto do_next_page;
+ goto do_next_folio;
/* If there are no errors, release the runlist lock if we took it. */
if (likely(!err)) {
if (unlikely(rl_write_locked)) {
@@ -1185,9 +1180,8 @@ rl_not_mapped_enoent:
bh = *--wait_bh;
wait_on_buffer(bh);
if (likely(buffer_uptodate(bh))) {
- page = bh->b_page;
- bh_pos = ((s64)page->index << PAGE_SHIFT) +
- bh_offset(bh);
+ folio = bh->b_folio;
+ bh_pos = folio_pos(folio) + bh_offset(bh);
/*
* If the buffer overflows the initialized size, need
* to zero the overflowing region.
@@ -1197,7 +1191,7 @@ rl_not_mapped_enoent:
if (likely(bh_pos < initialized_size))
ofs = initialized_size - bh_pos;
- zero_user_segment(page, bh_offset(bh) + ofs,
+ folio_zero_segment(folio, bh_offset(bh) + ofs,
blocksize);
}
} else /* if (unlikely(!buffer_uptodate(bh))) */
@@ -1324,21 +1318,20 @@ rl_not_mapped_enoent:
u = 0;
end = bh_cpos << vol->cluster_size_bits;
do {
- page = pages[u];
- bh = head = page_buffers(page);
+ folio = page_folio(pages[u]);
+ bh = head = folio_buffers(folio);
do {
if (u == nr_pages &&
- ((s64)page->index << PAGE_SHIFT) +
- bh_offset(bh) >= end)
+ folio_pos(folio) + bh_offset(bh) >= end)
break;
if (!buffer_new(bh))
continue;
clear_buffer_new(bh);
if (!buffer_uptodate(bh)) {
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 99ac6ea277c4..aba1e22db4e9 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -648,7 +648,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* mtime is the last change of the data within the file. Not changed
* when only metadata is changed, e.g. a rename doesn't affect mtime.
*/
- vi->i_mtime = ntfs2utc(si->last_data_change_time);
+ inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time));
/*
* ctime is the last change of the metadata of the file. This obviously
* always changes, when mtime is changed. ctime can be changed on its
@@ -659,7 +659,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* Last access to the data within the file. Not changed during a rename
* for example but changed whenever the file is written to.
*/
- vi->i_atime = ntfs2utc(si->last_access_time);
+ inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time));
/* Find the attribute list attribute if present. */
ntfs_attr_reinit_search_ctx(ctx);
@@ -1217,9 +1217,9 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
- vi->i_mtime = base_vi->i_mtime;
+ inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- vi->i_atime = base_vi->i_atime;
+ inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
vi->i_generation = ni->seq_no = base_ni->seq_no;
/* Set inode type to zero but preserve permissions. */
@@ -1483,9 +1483,9 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
- vi->i_mtime = base_vi->i_mtime;
+ inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- vi->i_atime = base_vi->i_atime;
+ inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
vi->i_generation = ni->seq_no = base_ni->seq_no;
/* Set inode type to zero but preserve permissions. */
vi->i_mode = base_vi->i_mode & ~S_IFMT;
@@ -2805,13 +2805,14 @@ done:
if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
struct timespec64 now = current_time(VFS_I(base_ni));
struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
+ struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni));
int sync_it = 0;
- if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+ if (!timespec64_equal(&mtime, &now) ||
!timespec64_equal(&ctime, &now))
sync_it = 1;
inode_set_ctime_to_ts(VFS_I(base_ni), now);
- VFS_I(base_ni)->i_mtime = now;
+ inode_set_mtime_to_ts(VFS_I(base_ni), now);
if (sync_it)
mark_inode_dirty_sync(VFS_I(base_ni));
@@ -2925,9 +2926,9 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
}
if (ia_valid & ATTR_ATIME)
- vi->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(vi, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- vi->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(vi, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(vi, attr->ia_ctime);
mark_inode_dirty(vi);
@@ -2996,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset));
/* Update the access times if they have changed. */
- nt = utc2ntfs(vi->i_mtime);
+ nt = utc2ntfs(inode_get_mtime(vi));
if (si->last_data_change_time != nt) {
ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
@@ -3014,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_mft_change_time = nt;
modified = true;
}
- nt = utc2ntfs(vi->i_atime);
+ nt = utc2ntfs(inode_get_atime(vi));
if (si->last_access_time != nt) {
ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index ad1a8f72da22..6fd1dc4b08c8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2682,7 +2682,7 @@ mft_rec_already_initialized:
vi->i_mode &= ~S_IWUGO;
/* Set the inode times to the current time. */
- vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi);
+ simple_inode_init_ts(vi);
/*
* Set the file size to 0, the ntfs inode sizes are set to 0 by
* the call to ntfs_init_big_inode() below.
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index ab44f2db533b..d7498ddc4a72 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -384,6 +384,7 @@ static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
* and due to using iget() whereas NTFS needs ntfs_iget().
*/
const struct export_operations ntfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = ntfs_get_parent, /* Find the parent of a given
directory. */
.fh_to_dentry = ntfs_fh_to_dentry,
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 1f7a194983c5..a5a30a24ce5d 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -187,7 +187,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
struct buffer_head *head, *bh;
u32 bh_next, bh_off, to;
sector_t iblock;
- struct page *page;
+ struct folio *folio;
for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
@@ -195,16 +195,17 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
PAGE_SIZE;
iblock = page_off >> inode->i_blkbits;
- page = find_or_create_page(mapping, idx,
- mapping_gfp_constraint(mapping,
- ~__GFP_FS));
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, idx,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
- bh = head = page_buffers(page);
+ bh = head;
bh_off = 0;
do {
bh_next = bh_off + blocksize;
@@ -220,14 +221,14 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
}
/* Ok, it's mapped. Make sure it's up-to-date. */
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
err = bh_read(bh, 0);
if (err < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
}
@@ -237,10 +238,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
} while (bh_off = bh_next, iblock += 1,
head != (bh = bh->b_this_page));
- zero_user_segment(page, from, to);
+ folio_zero_segment(folio, from, to);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
cond_resched();
}
out:
@@ -342,7 +343,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
err = 0;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
if (IS_SYNC(inode)) {
@@ -400,7 +401,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
ni_unlock(ni);
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (!IS_DIRSYNC(inode)) {
dirty = 1;
} else {
@@ -642,7 +643,7 @@ out:
filemap_invalidate_unlock(mapping);
if (!err) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
}
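
A minimal sketch of the folio lookup that replaces find_or_create_page() in ntfs_zero_range() above; note that __filemap_get_folio() reports failure with an ERR_PTR(), not NULL. The wrapper is illustrative only:

#include <linux/pagemap.h>

static struct folio *example_get_locked_folio(struct address_space *mapping,
					      pgoff_t index)
{
	return __filemap_get_folio(mapping, index,
				   FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				   mapping_gfp_constraint(mapping, ~__GFP_FS));
}
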
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index dad976a68985..3df2d9e34b91 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -3271,7 +3271,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_rec_inuse(ni->mi.mrec) &&
!(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
bool modified = false;
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts;
/* Update times in standard attribute. */
std = ni_std(ni);
@@ -3281,19 +3281,22 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
}
/* Update the access times if they have changed. */
- dup.m_time = kernel2nt(&inode->i_mtime);
+ ts = inode_get_mtime(inode);
+ dup.m_time = kernel2nt(&ts);
if (std->m_time != dup.m_time) {
std->m_time = dup.m_time;
modified = true;
}
- dup.c_time = kernel2nt(&ctime);
+ ts = inode_get_ctime(inode);
+ dup.c_time = kernel2nt(&ts);
if (std->c_time != dup.c_time) {
std->c_time = dup.c_time;
modified = true;
}
- dup.a_time = kernel2nt(&inode->i_atime);
+ ts = inode_get_atime(inode);
+ dup.a_time = kernel2nt(&ts);
if (std->a_time != dup.a_time) {
std->a_time = dup.a_time;
modified = true;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index d6d021e19aaa..5e3d71374918 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -44,7 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
u64 t64;
struct MFT_REC *rec;
struct runs_tree *run;
- struct timespec64 ctime;
+ struct timespec64 ts;
inode->i_op = NULL;
/* Setup 'uid' and 'gid' */
@@ -169,10 +169,12 @@ next_attr:
#ifdef STATX_BTIME
nt2kernel(std5->cr_time, &ni->i_crtime);
#endif
- nt2kernel(std5->a_time, &inode->i_atime);
- nt2kernel(std5->c_time, &ctime);
- inode_set_ctime_to_ts(inode, ctime);
- nt2kernel(std5->m_time, &inode->i_mtime);
+ nt2kernel(std5->a_time, &ts);
+ inode_set_atime_to_ts(inode, ts);
+ nt2kernel(std5->c_time, &ts);
+ inode_set_ctime_to_ts(inode, ts);
+ nt2kernel(std5->m_time, &ts);
+ inode_set_mtime_to_ts(inode, ts);
ni->std_fa = std5->fa;
@@ -960,7 +962,8 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
if (err >= 0) {
if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
dirty = true;
}
@@ -1660,9 +1663,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
/* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
- inode->i_atime = inode->i_mtime =
- inode_set_ctime_to_ts(inode, ni->i_crtime);
- dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime);
+ inode_set_atime_to_ts(inode, ni->i_crtime);
+ inode_set_ctime_to_ts(inode, ni->i_crtime);
+ inode_set_mtime_to_ts(inode, ni->i_crtime);
+ inode_set_mtime_to_ts(dir, ni->i_crtime);
+ inode_set_ctime_to_ts(dir, ni->i_crtime);
mark_inode_dirty(dir);
mark_inode_dirty(inode);
@@ -1768,7 +1773,7 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
if (!err) {
drop_nlink(inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
if (inode->i_nlink)
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index eedacf94edd8..ee3093be5170 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
err = ntfs_link_inode(inode, de);
if (!err) {
- dir->i_mtime = inode_set_ctime_to_ts(
- inode, inode_set_ctime_current(dir));
+ inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(inode);
mark_inode_dirty(dir);
d_instantiate(de, inode);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 0e6a2777870c..f6706143d14b 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -872,7 +872,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern const struct xattr_handler *ntfs_xattr_handlers[];
+extern const struct xattr_handler * const ntfs_xattr_handlers[];
int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index f763e3256ccc..9153dffde950 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -811,6 +811,7 @@ static int ntfs_nfs_commit_metadata(struct inode *inode)
}
static const struct export_operations ntfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ntfs_fh_to_dentry,
.fh_to_parent = ntfs_fh_to_parent,
.get_parent = ntfs3_get_parent,
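
A minimal sketch of wiring the generic 32-bit-inode file-handle encoder into an export_operations table, as the ntfs and ntfs3 hunks above do; the remaining methods are filesystem specific and omitted here:

#include <linux/exportfs.h>

static const struct export_operations example_export_ops = {
	.encode_fh	= generic_encode_ino32_fh,
	/* .fh_to_dentry, .fh_to_parent, .get_parent omitted in this sketch */
};
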
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 4920548192a0..4274b6f31cfa 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -1021,7 +1021,7 @@ static const struct xattr_handler ntfs_other_xattr_handler = {
.list = ntfs_xattr_user_list,
};
-const struct xattr_handler *ntfs_xattr_handlers[] = {
+const struct xattr_handler * const ntfs_xattr_handlers[] = {
&ntfs_other_xattr_handler,
NULL,
};
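
A minimal sketch of the constified xattr handler table shape introduced above, with a hypothetical handler whose callbacks are omitted; both the handler and the array of pointers to it can now live in read-only data:

#include <linux/xattr.h>

static const struct xattr_handler example_xattr_handler = {
	.prefix	= "user.",
	/* .get and .set callbacks omitted in this sketch */
};

const struct xattr_handler * const example_xattr_handlers[] = {
	&example_xattr_handler,
	NULL,
};
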
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e75137a8e7cb..62464d194da3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -193,8 +193,8 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
inode->i_mode = new_mode;
inode_set_ctime_current(inode);
di->i_mode = cpu_to_le16(inode->i_mode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index aef58f1395c8..91b32b2377ac 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -967,7 +967,14 @@ int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
el = &eb->h_list;
}
- BUG_ON(el->l_tree_depth != 0);
+ if (el->l_tree_depth != 0) {
+ retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)last_eb_blk,
+ le16_to_cpu(el->l_tree_depth));
+ goto bail;
+ }
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
@@ -7436,10 +7443,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
@@ -7642,7 +7649,7 @@ out_mutex:
goto next_group;
}
out:
- range->len = trimmed * sb->s_blocksize;
+ range->len = trimmed * osb->s_clustersize;
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0fdba30740ab..ba790219d528 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -568,10 +568,10 @@ static void ocfs2_clear_page_regions(struct page *page,
* read-in the blocks at the tail of our file. Avoid reading them by
* testing i_size against each block offset.
*/
-static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio,
unsigned int block_start)
{
- u64 offset = page_offset(page) + block_start;
+ u64 offset = folio_pos(folio) + block_start;
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
return 1;
@@ -593,15 +593,16 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
unsigned int block_end, block_start;
unsigned int bsize = i_blocksize(inode);
- if (!page_has_buffers(page))
- create_empty_buffers(page, bsize, 0);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, bsize, 0);
- head = page_buffers(page);
for (bh = head, block_start = 0; bh != head || !block_start;
bh = bh->b_this_page, block_start += bsize) {
block_end = block_start + bsize;
@@ -613,7 +614,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
* they may belong to unallocated clusters.
*/
if (block_start >= to || block_end <= from) {
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
continue;
}
@@ -630,11 +631,11 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
clean_bdev_bh_alias(bh);
}
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_new(bh) &&
- ocfs2_should_read_blk(inode, page, block_start) &&
+ ocfs2_should_read_blk(inode, folio, block_start) &&
(block_start < from || block_end > to)) {
bh_read_nowait(bh, 0);
*wait_bh++=bh;
@@ -668,7 +669,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (block_start >= to)
break;
- zero_user(page, block_start, bh->b_size);
+ folio_zero_range(folio, block_start, bh->b_size);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
@@ -2048,9 +2049,9 @@ out_write_size:
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
if (handle)
ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 196638a22b48..cdb9b9bdea1f 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -158,7 +158,7 @@ read_failure:
if (new_bh && bh) {
/* If middle bh fails, let previous bh
* finish its read and then put it to
- * aovoid bh leak
+ * avoid bh leak
*/
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -345,7 +345,7 @@ read_failure:
if (new_bh && bh) {
/* If middle bh fails, let previous bh
* finish its read and then put it to
- * aovoid bh leak
+ * avoid bh leak
*/
if (!buffer_jbd(bh))
wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21472e3ed182..4d7efefa98c5 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -213,7 +213,7 @@ struct o2hb_region {
unsigned int hr_num_pages;
struct page **hr_slot_data;
- struct block_device *hr_bdev;
+ struct bdev_handle *hr_bdev_handle;
struct o2hb_disk_slot *hr_slots;
/* live node map of this region */
@@ -261,6 +261,11 @@ struct o2hb_region {
int hr_last_hb_status;
};
+static inline struct block_device *reg_bdev(struct o2hb_region *reg)
+{
+ return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
+}
+
struct o2hb_bio_wait_ctxt {
atomic_t wc_num_reqs;
struct completion wc_io_complete;
@@ -286,7 +291,7 @@ static void o2hb_write_timeout(struct work_struct *work)
hr_write_timeout_work.work);
mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u "
- "milliseconds\n", reg->hr_bdev,
+ "milliseconds\n", reg_bdev(reg),
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
@@ -383,7 +388,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n",
o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
set_bit(master_node, reg->hr_nego_node_bitmap);
}
if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
@@ -398,7 +403,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
}
printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item),
+ reg_bdev(reg));
/* approve negotiate timeout request. */
o2hb_arm_timeout(reg);
@@ -419,7 +425,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
/* negotiate timeout with master node. */
printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n",
o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
- reg->hr_bdev, master_node);
+ reg_bdev(reg), master_node);
ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
master_node);
if (ret)
@@ -436,7 +442,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
nego_msg = (struct o2hb_nego_msg *)msg->buf;
printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n",
- nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_bdev);
+ nego_msg->node_num, config_item_name(&reg->hr_item),
+ reg_bdev(reg));
if (nego_msg->node_num < O2NM_MAX_NODES)
set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
else
@@ -451,7 +458,7 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
struct o2hb_region *reg = data;
printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
o2hb_arm_timeout(reg);
return 0;
}
@@ -515,7 +522,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC);
+ bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -687,7 +694,7 @@ static int o2hb_check_own_slot(struct o2hb_region *reg)
errstr = ERRSTR3;
mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), "
- "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev,
+ "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg),
slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
@@ -861,7 +868,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
goto unlock;
printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
@@ -920,7 +927,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
* consider it a transient miss but don't populate any
* other values as they may be junk. */
mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n",
- slot->ds_node_num, reg->hr_bdev);
+ slot->ds_node_num, reg_bdev(reg));
o2hb_dump_slot(hb_block);
slot->ds_equal_samples++;
@@ -1003,8 +1010,8 @@ fire_callbacks:
"of %u ms, but our count is %u ms.\n"
"Please double check your configuration values "
"for 'O2CB_HEARTBEAT_THRESHOLD'\n",
- slot->ds_node_num, reg->hr_bdev, slot_dead_ms,
- dead_ms);
+ slot->ds_node_num, reg_bdev(reg),
+ slot_dead_ms, dead_ms);
}
goto out;
}
@@ -1143,7 +1150,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* can't be sure that the new block ever made it to
* disk */
mlog(ML_ERROR, "Write error %d on device \"%pg\"\n",
- write_wc.wc_error, reg->hr_bdev);
+ write_wc.wc_error, reg_bdev(reg));
ret = write_wc.wc_error;
goto bail;
}
@@ -1169,7 +1176,7 @@ bail:
printk(KERN_NOTICE "o2hb: Unable to stabilize "
"heartbeat on region %s (%pg)\n",
config_item_name(&reg->hr_item),
- reg->hr_bdev);
+ reg_bdev(reg));
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
@@ -1489,7 +1496,7 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
- mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev);
+ mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg));
kfree(reg->hr_tmp_block);
@@ -1502,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slot_data);
}
- if (reg->hr_bdev)
- blkdev_put(reg->hr_bdev, NULL);
+ if (reg->hr_bdev_handle)
+ bdev_release(reg->hr_bdev_handle);
kfree(reg->hr_slots);
@@ -1562,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
unsigned long block_bytes;
unsigned int block_bits;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
status = o2hb_read_block_input(reg, page, &block_bytes,
@@ -1591,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item,
char *p = (char *)page;
ssize_t ret;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
ret = kstrtoull(p, 0, &tmp);
@@ -1616,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item,
unsigned long tmp;
char *p = (char *)page;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
tmp = simple_strtoul(p, &p, 0);
@@ -1635,8 +1642,8 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (to_o2hb_region(item)->hr_bdev)
- ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev);
+ if (to_o2hb_region(item)->hr_bdev_handle)
+ ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item)));
return ret;
}
@@ -1745,7 +1752,10 @@ out:
return ret;
}
-/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
+/*
+ * this is acting as commit; we set up all of hr_bdev_handle and hr_task or
+ * nothing
+ */
static ssize_t o2hb_region_dev_store(struct config_item *item,
const char *page,
size_t count)
@@ -1759,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
ssize_t ret = -EINVAL;
int live_threshold;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
goto out;
/* We can't heartbeat without having had our node number
@@ -1785,16 +1795,15 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
if (!S_ISBLK(f.file->f_mapping->host->i_mode))
goto out2;
- reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev,
- BLK_OPEN_WRITE | BLK_OPEN_READ, NULL,
- NULL);
- if (IS_ERR(reg->hr_bdev)) {
- ret = PTR_ERR(reg->hr_bdev);
- reg->hr_bdev = NULL;
+ reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev,
+ BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(reg->hr_bdev_handle)) {
+ ret = PTR_ERR(reg->hr_bdev_handle);
+ reg->hr_bdev_handle = NULL;
goto out2;
}
- sectsize = bdev_logical_block_size(reg->hr_bdev);
+ sectsize = bdev_logical_block_size(reg_bdev(reg));
if (sectsize != reg->hr_block_bytes) {
mlog(ML_ERROR,
"blocksize %u incorrect for device, expected %d",
@@ -1890,12 +1899,12 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
if (hb_task && o2hb_global_heartbeat_active())
printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
out3:
if (ret < 0) {
- blkdev_put(reg->hr_bdev, NULL);
- reg->hr_bdev = NULL;
+ bdev_release(reg->hr_bdev_handle);
+ reg->hr_bdev_handle = NULL;
}
out2:
fdput(f);
@@ -2085,7 +2094,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n",
((atomic_read(&reg->hr_steady_iterations) == 0) ?
"stopped" : "start aborted"), config_item_name(item),
- reg->hr_bdev);
+ reg_bdev(reg));
}
/*
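A minimal sketch of the accessor these heartbeat.c hunks rely on, assuming the new o2hb_region field is "struct bdev_handle *hr_bdev_handle" as the bdev_open_by_dev()/bdev_release() pairing suggests; the real reg_bdev() is defined earlier in the patch, outside this excerpt:

static inline struct block_device *reg_bdev(struct o2hb_region *reg)
{
	/* NULL until o2hb_region_dev_store() has opened the device */
	return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
}

Keeping the handle rather than a bare block_device pointer lets the release path collapse to a single bdev_release() call, which is why every former reg->hr_bdev user above now goes through the accessor.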
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8b123d543e6e..a14c8fee6ee5 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1658,7 +1658,8 @@ int __ocfs2_add_entry(handle_t *handle,
offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
if (ocfs2_dirent_would_fit(de, rec_len)) {
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (retval < 0) {
mlog_errno(retval);
@@ -2962,11 +2963,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dinode_new_extent_list(dir, di);
i_size_write(dir, sb->s_blocksize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
di->i_size = cpu_to_le64(sb->s_blocksize);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir));
ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 81265123ce6c..85215162c9dd 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -80,8 +80,7 @@ static int param_set_dlmfs_capabilities(const char *val,
static int param_get_dlmfs_capabilities(char *buffer,
const struct kernel_param *kp)
{
- return strlcpy(buffer, DLMFS_CAPABILITIES,
- strlen(DLMFS_CAPABILITIES) + 1);
+ return sysfs_emit(buffer, DLMFS_CAPABILITIES);
}
module_param_call(capabilities, param_set_dlmfs_capabilities,
param_get_dlmfs_capabilities, NULL, 0444);
@@ -337,7 +336,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
@@ -360,7 +359,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c3e2961ee5db..64a6ef638495 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2162,7 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts;
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2183,12 +2183,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
- lvb->lvb_iatime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
- lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ctime));
- lvb->lvb_imtime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+ ts = inode_get_atime(inode);
+ lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_ctime(inode);
+ lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_mtime(inode);
+ lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2209,7 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec64 ctime;
+ struct timespec64 ts;
mlog_meta_lvb(0, lockres);
@@ -2236,13 +2236,12 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
- ocfs2_unpack_timespec(&inode->i_atime,
- be64_to_cpu(lvb->lvb_iatime_packed));
- ocfs2_unpack_timespec(&inode->i_mtime,
- be64_to_cpu(lvb->lvb_imtime_packed));
- ocfs2_unpack_timespec(&ctime,
- be64_to_cpu(lvb->lvb_ictime_packed));
- inode_set_ctime_to_ts(inode, ctime);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed));
+ inode_set_atime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed));
+ inode_set_mtime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed));
+ inode_set_ctime_to_ts(inode, ts);
spin_unlock(&oi->ip_lock);
return 0;
}
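The dlmglue.c hunks, and the ocfs2 conversions that follow, all apply one idiom: direct reads and writes of inode->i_atime/i_mtime become calls to the inode_get_*()/inode_set_*() timestamp accessors already visible in the new lines. A minimal sketch of the resulting shape (the function name is illustrative; the helpers are the ones used above):

static void example_touch_mtime_and_ctime(struct inode *inode)
{
	/* old idiom: inode->i_mtime = inode_set_ctime_current(inode); */
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}

inode_set_ctime_current() still returns the timestamp it stored, so the one-liner keeps mtime and ctime identical without touching the struct fields directly.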
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c45596c25c66..94e2a1244442 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -233,16 +233,18 @@ int ocfs2_should_update_atime(struct inode *inode,
if (vfsmnt->mnt_flags & MNT_RELATIME) {
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 atime = inode_get_atime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
- if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
- (timespec64_compare(&inode->i_atime, &ctime) <= 0))
+ if ((timespec64_compare(&atime, &mtime) <= 0) ||
+ (timespec64_compare(&atime, &ctime) <= 0))
return 1;
return 0;
}
now = current_time(inode);
- if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
+ if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum))
return 0;
else
return 1;
@@ -275,9 +277,9 @@ int ocfs2_update_inode_atime(struct inode *inode,
* have i_rwsem to guard against concurrent changes to other
* inode fields.
*/
- inode->i_atime = current_time(inode);
- di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ inode_set_atime_to_ts(inode, current_time(inode));
+ di->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, bh);
@@ -296,7 +298,7 @@ int ocfs2_set_inode_size(handle_t *handle,
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -417,12 +419,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
}
i_size_write(inode, new_i_size);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
@@ -821,9 +823,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
i_size_write(inode, abs_to);
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
di->i_mtime_nsec = di->i_ctime_nsec;
if (handle) {
ocfs2_journal_dirty(handle, di_bh);
@@ -2040,7 +2042,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index e8771600b930..999111bfc271 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -302,10 +302,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_mapping->a_ops = &ocfs2_aops;
}
- inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
- inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
- inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
- inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+ inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+ le32_to_cpu(fe->i_atime_nsec));
+ inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+ le32_to_cpu(fe->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
le32_to_cpu(fe->i_ctime_nsec));
@@ -1312,12 +1312,12 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_uid = cpu_to_le32(i_uid_read(inode));
fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
- fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
ocfs2_journal_dirty(handle, bh);
ocfs2_update_inode_fsync_trans(handle, inode, 1);
@@ -1348,10 +1348,10 @@ void ocfs2_refresh_inode(struct inode *inode,
inode->i_blocks = 0;
else
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
- inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
- inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
- inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+ inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+ le32_to_cpu(fe->i_atime_nsec));
+ inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+ le32_to_cpu(fe->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
le32_to_cpu(fe->i_ctime_nsec));
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ce215565d061..604fea3a26ff 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -90,7 +90,7 @@ enum ocfs2_replay_state {
struct ocfs2_replay_map {
unsigned int rm_slots;
enum ocfs2_replay_state rm_state;
- unsigned char rm_replay_slots[];
+ unsigned char rm_replay_slots[] __counted_by(rm_slots);
};
static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
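__counted_by(rm_slots) ties the flexible array to its length field, so fortified accesses to rm_replay_slots are bounds-checked against rm_slots and the counter has to be populated no later than the first array access. A minimal allocation sketch under that assumption, following the usual struct_size() pattern (num_slots and the error handling are illustrative):

	struct ocfs2_replay_map *map;

	map = kzalloc(struct_size(map, rm_replay_slots, num_slots), GFP_KERNEL);
	if (!map)
		return -ENOMEM;
	map->rm_slots = num_slots;	/* set the counter before using the array */

The same reasoning applies to the si_slots annotation added to fs/ocfs2/slot_map.c further down.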
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 05d67968a3a9..1f9ed117e78b 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -951,8 +951,8 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
di = (struct ocfs2_dinode *)di_bh->b_data;
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 5cd6d7771cea..814733ba2f4b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -795,8 +795,8 @@ static int ocfs2_link(struct dentry *old_dentry,
inc_nlink(inode);
inode_set_ctime_current(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
@@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir,
ocfs2_set_links_count(fe, inode->i_nlink);
ocfs2_journal_dirty(handle, fe_bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (S_ISDIR(inode->i_mode))
drop_nlink(dir);
@@ -1550,8 +1550,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
- old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
- old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
+ old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode));
+ old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode));
ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1592,11 +1592,15 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
drop_nlink(new_inode);
inode_set_ctime_current(new_inode);
}
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
drop_nlink(old_dir);
if (new_inode) {
drop_nlink(new_inode);
@@ -1614,8 +1618,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (old_dir != new_dir) {
/* Keep the same times on both directories.*/
- new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
- inode_get_ctime(old_dir));
+ inode_set_mtime_to_ts(new_dir,
+ inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir)));
/*
* This will also pick up the i_nlink change from the
@@ -1636,6 +1640,10 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
INODE_CACHE(old_dir),
old_dir_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
ocfs2_journal_dirty(handle, old_dir_bh);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dfaae1e52412..e09842fc9d4d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1240,6 +1240,10 @@ int ocfs2_create_local_dquot(struct dquot *dquot)
&od->dq_local_phys_blk,
&pcount,
NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
/* Initialize dquot structure on disk */
status = ocfs2_local_write_dquot(dquot);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 25c8ec3c8c3a..3f80a56d0d60 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3751,8 +3751,8 @@ static int ocfs2_change_ctime(struct inode *inode,
}
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(handle, di_bh);
@@ -4075,10 +4075,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
*/
inode_set_ctime_current(t_inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode));
- t_inode->i_mtime = s_inode->i_mtime;
+ inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode));
di->i_mtime = s_di->i_mtime;
di->i_mtime_nsec = s_di->i_mtime_nsec;
}
@@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest,
if (newlen > i_size_read(dest))
i_size_write(dest, newlen);
spin_unlock(&OCFS2_I(dest)->ip_lock);
- dest->i_mtime = inode_set_ctime_current(dest);
+ inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest));
ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
if (ret) {
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index da7718cef735..e544c704b583 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -37,7 +37,7 @@ struct ocfs2_slot_info {
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot si_slots[];
+ struct ocfs2_slot si_slots[] __counted_by(si_num_slots);
};
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6510ad783c91..3b81213ed7b8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -87,14 +87,14 @@ static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
};
-const struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler * const ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
&ocfs2_xattr_trusted_handler,
&ocfs2_xattr_security_handler,
NULL
};
-static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
@@ -3422,8 +3422,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
}
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
}
out:
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f..65e9aa743919 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,7 +30,7 @@ struct ocfs2_security_xattr_info {
extern const struct xattr_handler ocfs2_xattr_user_handler;
extern const struct xattr_handler ocfs2_xattr_trusted_handler;
extern const struct xattr_handler ocfs2_xattr_security_handler;
-extern const struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler * const ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 2f8c1882f45c..d6cd81163030 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_mapping->a_ops = &omfs_aops;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_op = &omfs_dir_inops;
@@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait)
oi->i_head.h_magic = OMFS_IMAGIC;
oi->i_size = cpu_to_be64(inode->i_size);
- ctime = inode_get_ctime(inode).tv_sec * 1000LL +
- ((inode_get_ctime(inode).tv_nsec + 999)/1000);
+ ctime = inode_get_ctime_sec(inode) * 1000LL +
+ ((inode_get_ctime_nsec(inode) + 999)/1000);
oi->i_ctime = cpu_to_be64(ctime);
omfs_update_checksums(oi);
@@ -230,11 +230,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
ctime = be64_to_cpu(oi->i_ctime);
nsecs = do_div(ctime, 1000) * 1000L;
- inode->i_atime.tv_sec = ctime;
- inode->i_mtime.tv_sec = ctime;
+ inode_set_atime(inode, ctime, nsecs);
+ inode_set_mtime(inode, ctime, nsecs);
inode_set_ctime(inode, ctime, nsecs);
- inode->i_atime.tv_nsec = nsecs;
- inode->i_mtime.tv_nsec = nsecs;
inode->i_mapping->a_ops = &omfs_aops;
diff --git a/fs/open.c b/fs/open.c
index 98f6601fbac6..3494a9cd8046 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
return ksys_fchown(fd, user, group);
}
+static inline int file_get_write_access(struct file *f)
+{
+ int error;
+
+ error = get_write_access(f->f_inode);
+ if (unlikely(error))
+ return error;
+ error = mnt_get_write_access(f->f_path.mnt);
+ if (unlikely(error))
+ goto cleanup_inode;
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ error = mnt_get_write_access(backing_file_user_path(f)->mnt);
+ if (unlikely(error))
+ goto cleanup_mnt;
+ }
+ return 0;
+
+cleanup_mnt:
+ mnt_put_write_access(f->f_path.mnt);
+cleanup_inode:
+ put_write_access(f->f_inode);
+ return error;
+}
+
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
@@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_inc(inode);
} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
- error = get_write_access(inode);
+ error = file_get_write_access(f);
if (unlikely(error))
goto cleanup_file;
- error = __mnt_want_write(f->f_path.mnt);
- if (unlikely(error)) {
- put_write_access(inode);
- goto cleanup_file;
- }
f->f_mode |= FMODE_WRITER;
}
@@ -1069,8 +1088,6 @@ struct file *dentry_open(const struct path *path, int flags,
int error;
struct file *f;
- validate_creds(cred);
-
/* We must always pass in a valid mount pointer. */
BUG_ON(!path->mnt);
@@ -1109,7 +1126,6 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
struct file *f;
int error;
- validate_creds(cred);
f = alloc_empty_file(flags, cred);
if (IS_ERR(f))
return f;
@@ -1163,20 +1179,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
/**
* backing_file_open - open a backing file for kernel internal use
- * @path: path of the file to open
+ * @user_path: path that the user requested to open
* @flags: open flags
* @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @path may be on the stackable filesystem and backing inode on the
- * underlying filesystem. In this case, we want to be able to return
- * the @real_path of the backing inode. This is done by embedding the
- * returned file into a container structure that also stores the path of
- * the backing inode on the underlying filesystem, which can be
- * retrieved using backing_file_real_path().
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem. In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
*/
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
const struct path *real_path,
const struct cred *cred)
{
@@ -1187,9 +1202,9 @@ struct file *backing_file_open(const struct path *path, int flags,
if (IS_ERR(f))
return f;
- f->f_path = *path;
- path_get(real_path);
- *backing_file_real_path(f) = *real_path;
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ f->f_path = *real_path;
error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
if (error) {
fput(f);
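After this change a backing file (FMODE_BACKING) keeps the underlying path in f->f_path and the path the user actually opened in the side container, which is what lets file_get_write_access() above take write access on the user-visible mount as well. A minimal sketch of how a caller would recover the user-facing path, using only the helpers shown in these hunks (the function name is illustrative):

static struct path *example_user_visible_path(struct file *f)
{
	if (unlikely(f->f_mode & FMODE_BACKING))
		return backing_file_user_path(f);
	return &f->f_path;
}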
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index b2457cb97fa0..c4b65a6d41cc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -237,7 +237,7 @@ found:
if (IS_ERR(inode))
return ERR_CAST(inode);
if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ent_oi = OP_I(inode);
ent_oi->type = ent_type;
ent_oi->u = ent_data;
@@ -387,7 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
goto out_no_root;
}
- root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode);
+ simple_inode_init_ts(root_inode);
root_inode->i_op = &openprom_inode_operations;
root_inode->i_fop = &openprom_operations;
root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index b711654ca18a..926d9c0a428a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -103,7 +103,7 @@ enum orangefs_vfs_op_states {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif
-extern const struct xattr_handler *orangefs_xattr_handlers[];
+extern const struct xattr_handler * const orangefs_xattr_handlers[];
extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
extern int orangefs_set_acl(struct mnt_idmap *idmap,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 0a9fcfdf552f..0fdceb00ca07 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -155,14 +155,14 @@ static inline void copy_attributes_from_inode(struct inode *inode,
if (orangefs_inode->attr_valid & ATTR_ATIME) {
attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
if (orangefs_inode->attr_valid & ATTR_ATIME_SET) {
- attrs->atime = (time64_t)inode->i_atime.tv_sec;
+ attrs->atime = (time64_t) inode_get_atime_sec(inode);
attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
}
}
if (orangefs_inode->attr_valid & ATTR_MTIME) {
attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
if (orangefs_inode->attr_valid & ATTR_MTIME_SET) {
- attrs->mtime = (time64_t)inode->i_mtime.tv_sec;
+ attrs->mtime = (time64_t) inode_get_mtime_sec(inode);
attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
}
}
@@ -357,15 +357,15 @@ again2:
downcall.resp.getattr.attributes.owner);
inode->i_gid = make_kgid(&init_user_ns, new_op->
downcall.resp.getattr.attributes.group);
- inode->i_atime.tv_sec = (time64_t)new_op->
- downcall.resp.getattr.attributes.atime;
- inode->i_mtime.tv_sec = (time64_t)new_op->
- downcall.resp.getattr.attributes.mtime;
+ inode_set_atime(inode,
+ (time64_t)new_op->downcall.resp.getattr.attributes.atime,
+ 0);
+ inode_set_mtime(inode,
+ (time64_t)new_op->downcall.resp.getattr.attributes.mtime,
+ 0);
inode_set_ctime(inode,
(time64_t)new_op->downcall.resp.getattr.attributes.ctime,
0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
/* special case: mark the root inode as sticky */
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 68b62689a63e..74ef75586f38 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -554,7 +554,7 @@ static const struct xattr_handler orangefs_xattr_default_handler = {
.set = orangefs_xattr_set_default,
};
-const struct xattr_handler *orangefs_xattr_handlers[] = {
+const struct xattr_handler * const orangefs_xattr_handlers[] = {
&orangefs_xattr_default_handler,
NULL
};
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
index 4e173d56b11f..5648954f8588 100644
--- a/fs/overlayfs/Makefile
+++ b/fs/overlayfs/Makefile
@@ -6,4 +6,4 @@
obj-$(CONFIG_OVERLAY_FS) += overlay.o
overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \
- copy_up.o export.o params.o
+ copy_up.o export.o params.o xattrs.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index ada3fcc9c6d5..8bea66c97316 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -252,7 +252,9 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
return PTR_ERR(old_file);
/* Try to use clone_file_range to clone up within the same fs */
+ ovl_start_write(dentry);
cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+ ovl_end_write(dentry);
if (cloned == len)
goto out_fput;
/* Couldn't clone, so now we try to copy the data */
@@ -287,8 +289,12 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
* it may not recognize all kind of holes and sometimes
* only skips partial of hole area. However, it will be
* enough for most of the use cases.
+ *
+ * We do not hold upper sb_writers throughout the loop to avert
+ * lockdep warning with llseek of lower file in nested overlay:
+ * - upper sb_writers
+ * -- lower ovl_inode_lock (ovl_llseek)
*/
-
if (skip_hole && data_pos < old_pos) {
data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA);
if (data_pos > old_pos) {
@@ -303,9 +309,11 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
}
}
+ ovl_start_write(dentry);
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
+ ovl_end_write(dentry);
if (bytes <= 0) {
error = bytes;
break;
@@ -426,29 +434,29 @@ out_err:
return ERR_PTR(err);
}
-int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
- struct dentry *upper)
+struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin)
{
- const struct ovl_fh *fh = NULL;
- int err;
-
/*
* When lower layer doesn't support export operations store a 'null' fh,
 * so we can use the overlay.origin xattr to distinguish between a copy
* up and a pure upper inode.
*/
- if (ovl_can_decode_fh(lower->d_sb)) {
- fh = ovl_encode_real_fh(ofs, lower, false);
- if (IS_ERR(fh))
- return PTR_ERR(fh);
- }
+ if (!ovl_can_decode_fh(origin->d_sb))
+ return NULL;
+
+ return ovl_encode_real_fh(ofs, origin, false);
+}
+
+int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+ struct dentry *upper)
+{
+ int err;
/*
* Do not fail when upper doesn't support xattrs.
*/
err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf,
fh ? fh->fb.len : 0, 0);
- kfree(fh);
/* Ignore -EPERM from setting "user.*" on symlink/special */
return err == -EPERM ? 0 : err;
@@ -476,7 +484,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
*
* Caller must hold i_mutex on indexdir.
*/
-static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
+static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
struct dentry *upper)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
@@ -502,7 +510,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
return -EIO;
- err = ovl_get_index_name(ofs, origin, &name);
+ err = ovl_get_index_name_fh(fh, &name);
if (err)
return err;
@@ -541,6 +549,7 @@ struct ovl_copy_up_ctx {
struct dentry *destdir;
struct qstr destname;
struct dentry *workdir;
+ const struct ovl_fh *origin_fh;
bool origin;
bool indexed;
bool metacopy;
@@ -555,14 +564,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(upperdir);
+ ovl_start_write(c->dentry);
+
/* Mark parent "impure" because it may now contain non-pure upper */
err = ovl_set_impure(c->parent, upperdir);
if (err)
- return err;
+ goto out;
err = ovl_set_nlink_lower(c->dentry);
if (err)
- return err;
+ goto out;
inode_lock_nested(udir, I_MUTEX_PARENT);
upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
@@ -581,10 +592,12 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
}
inode_unlock(udir);
if (err)
- return err;
+ goto out;
err = ovl_set_nlink_upper(c->dentry);
+out:
+ ovl_end_write(c->dentry);
return err;
}
@@ -637,7 +650,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
* hard link.
*/
if (c->origin) {
- err = ovl_set_origin(ofs, c->lowerpath.dentry, temp);
+ err = ovl_set_origin_fh(ofs, c->origin_fh, temp);
if (err)
return err;
}
@@ -719,21 +732,19 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
.link = c->link
};
- /* workdir and destdir could be the same when copying up to indexdir */
- err = -EIO;
- if (lock_rename(c->workdir, c->destdir) != NULL)
- goto unlock;
-
err = ovl_prep_cu_creds(c->dentry, &cc);
if (err)
- goto unlock;
+ return err;
+ ovl_start_write(c->dentry);
+ inode_lock(wdir);
temp = ovl_create_temp(ofs, c->workdir, &cattr);
+ inode_unlock(wdir);
+ ovl_end_write(c->dentry);
ovl_revert_cu_creds(&cc);
- err = PTR_ERR(temp);
if (IS_ERR(temp))
- goto unlock;
+ return PTR_ERR(temp);
/*
* Copy up data first and then xattrs. Writing data after
@@ -741,15 +752,29 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
*/
path.dentry = temp;
err = ovl_copy_up_data(c, &path);
- if (err)
+ /*
+ * We cannot hold lock_rename() throughout this helper, because of
+ * lock ordering with sb_writers, which shouldn't be held when calling
+ * ovl_copy_up_data(), so lock workdir and destdir and make sure that
+ * temp wasn't moved before copy up completion or cleanup.
+ */
+ ovl_start_write(c->dentry);
+ if (lock_rename(c->workdir, c->destdir) != NULL ||
+ temp->d_parent != c->workdir) {
+ /* temp or workdir moved underneath us? abort without cleanup */
+ dput(temp);
+ err = -EIO;
+ goto unlock;
+ } else if (err) {
goto cleanup;
+ }
err = ovl_copy_up_metadata(c, temp);
if (err)
goto cleanup;
if (S_ISDIR(c->stat.mode) && c->indexed) {
- err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
+ err = ovl_create_index(c->dentry, c->origin_fh, temp);
if (err)
goto cleanup;
}
@@ -779,6 +804,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
ovl_set_flag(OVL_WHITEOUTS, inode);
unlock:
unlock_rename(c->workdir, c->destdir);
+ ovl_end_write(c->dentry);
return err;
@@ -802,9 +828,10 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
return err;
+ ovl_start_write(c->dentry);
tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
+ ovl_end_write(c->dentry);
ovl_revert_cu_creds(&cc);
-
if (IS_ERR(tmpfile))
return PTR_ERR(tmpfile);
@@ -815,9 +842,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
goto out_fput;
}
+ ovl_start_write(c->dentry);
+
err = ovl_copy_up_metadata(c, temp);
if (err)
- goto out_fput;
+ goto out;
inode_lock_nested(udir, I_MUTEX_PARENT);
@@ -831,7 +860,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
inode_unlock(udir);
if (err)
- goto out_fput;
+ goto out;
if (c->metacopy_digest)
ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry));
@@ -843,6 +872,8 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
ovl_set_upperdata(d_inode(c->dentry));
ovl_inode_update(d_inode(c->dentry), dget(temp));
+out:
+ ovl_end_write(c->dentry);
out_fput:
fput(tmpfile);
return err;
@@ -861,6 +892,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
{
int err;
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct dentry *origin = c->lowerpath.dentry;
+ struct ovl_fh *fh = NULL;
bool to_index = false;
/*
@@ -877,25 +910,35 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
to_index = true;
}
- if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
+ if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) {
+ fh = ovl_get_origin_fh(ofs, origin);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+ /* origin_fh may be NULL */
+ c->origin_fh = fh;
c->origin = true;
+ }
if (to_index) {
c->destdir = ovl_indexdir(c->dentry->d_sb);
- err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname);
+ err = ovl_get_index_name(ofs, origin, &c->destname);
if (err)
- return err;
+ goto out_free_fh;
} else if (WARN_ON(!c->parent)) {
/* Disconnected dentry must be copied up to index dir */
- return -EIO;
+ err = -EIO;
+ goto out_free_fh;
} else {
/*
* Mark parent "impure" because it may now contain non-pure
* upper
*/
+ ovl_start_write(c->dentry);
err = ovl_set_impure(c->parent, c->destdir);
+ ovl_end_write(c->dentry);
if (err)
- return err;
+ goto out_free_fh;
}
/* Should we copyup with O_TMPFILE or with workdir? */
@@ -909,6 +952,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
if (c->indexed)
ovl_set_flag(OVL_INDEX, d_inode(c->dentry));
+ ovl_start_write(c->dentry);
if (to_index) {
/* Initialize nlink for copy up of disconnected dentry */
err = ovl_set_nlink_upper(c->dentry);
@@ -923,10 +967,13 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
ovl_dentry_set_upper_alias(c->dentry);
ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry));
}
+ ovl_end_write(c->dentry);
out:
if (to_index)
kfree(c->destname.name);
+out_free_fh:
+ kfree(fh);
return err;
}
@@ -1011,15 +1058,16 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
* Writing to upper file will clear security.capability xattr. We
* don't want that to happen for normal copy-up operation.
*/
+ ovl_start_write(c->dentry);
if (capability) {
err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS,
capability, cap_size, 0);
- if (err)
- goto out_free;
}
-
-
- err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
+ if (!err) {
+ err = ovl_removexattr(ofs, upperpath.dentry,
+ OVL_XATTR_METACOPY);
+ }
+ ovl_end_write(c->dentry);
if (err)
goto out_free;
@@ -1170,17 +1218,10 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
int ovl_maybe_copy_up(struct dentry *dentry, int flags)
{
- int err = 0;
-
- if (ovl_open_need_copy_up(dentry, flags)) {
- err = ovl_want_write(dentry);
- if (!err) {
- err = ovl_copy_up_flags(dentry, flags);
- ovl_drop_write(dentry);
- }
- }
+ if (!ovl_open_need_copy_up(dentry, flags))
+ return 0;
- return err;
+ return ovl_copy_up_flags(dentry, flags);
}
int ovl_copy_up_with_data(struct dentry *dentry)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 033fc0458a3d..aab3f5d93556 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -477,7 +477,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
goto out_unlock;
err = -ESTALE;
- if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper)))
+ if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper))
goto out_dput;
newdentry = ovl_create_temp(ofs, workdir, cattr);
@@ -559,10 +559,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
struct cred *override_cred;
struct dentry *parent = dentry->d_parent;
- err = ovl_copy_up(parent);
- if (err)
- return err;
-
old_cred = ovl_override_creds(dentry->d_sb);
/*
@@ -626,6 +622,10 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
.link = link,
};
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ return err;
+
err = ovl_want_write(dentry);
if (err)
goto out;
@@ -700,28 +700,24 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
int err;
struct inode *inode;
- err = ovl_want_write(old);
+ err = ovl_copy_up(old);
if (err)
goto out;
- err = ovl_copy_up(old);
+ err = ovl_copy_up(new->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
- err = ovl_copy_up(new->d_parent);
+ err = ovl_nlink_start(old);
if (err)
- goto out_drop_write;
+ goto out;
if (ovl_is_metacopy_dentry(old)) {
err = ovl_set_link_redirect(old);
if (err)
- goto out_drop_write;
+ goto out_nlink_end;
}
- err = ovl_nlink_start(old);
- if (err)
- goto out_drop_write;
-
inode = d_inode(old);
ihold(inode);
@@ -731,9 +727,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
if (err)
iput(inode);
+out_nlink_end:
ovl_nlink_end(old);
-out_drop_write:
- ovl_drop_write(old);
out:
return err;
}
@@ -891,17 +886,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
goto out;
}
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
err = ovl_copy_up(dentry->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
err = ovl_nlink_start(dentry);
if (err)
- goto out_drop_write;
+ goto out;
old_cred = ovl_override_creds(dentry->d_sb);
if (!lower_positive)
@@ -926,8 +917,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (ovl_dentry_upper(dentry))
ovl_copyattr(d_inode(dentry));
-out_drop_write:
- ovl_drop_write(dentry);
out:
ovl_cache_free(&list);
return err;
@@ -1131,29 +1120,32 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
}
- err = ovl_want_write(old);
- if (err)
- goto out;
-
err = ovl_copy_up(old);
if (err)
- goto out_drop_write;
+ goto out;
err = ovl_copy_up(new->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
- goto out_drop_write;
+ goto out;
} else if (d_inode(new)) {
err = ovl_nlink_start(new);
if (err)
- goto out_drop_write;
+ goto out;
update_nlink = true;
}
+ if (!update_nlink) {
+ /* ovl_nlink_start() took ovl_want_write() */
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+ }
+
old_cred = ovl_override_creds(old->d_sb);
if (!list_empty(&list)) {
@@ -1219,7 +1211,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
} else {
if (!d_is_negative(newdentry)) {
- if (!new_opaque || !ovl_is_whiteout(newdentry))
+ if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry))
goto out_dput;
} else {
if (flags & RENAME_EXCHANGE)
@@ -1286,8 +1278,8 @@ out_revert_creds:
revert_creds(old_cred);
if (update_nlink)
ovl_nlink_end(new);
-out_drop_write:
- ovl_drop_write(old);
+ else
+ ovl_drop_write(old);
out:
dput(opaquedir);
ovl_cache_free(&list);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 26b782c53910..7e16bbcad95e 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -23,12 +23,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry)
if (ovl_dentry_upper(dentry))
return 0;
- err = ovl_want_write(dentry);
- if (!err) {
- err = ovl_copy_up(dentry);
- ovl_drop_write(dentry);
- }
-
+ err = ovl_copy_up(dentry);
if (err) {
pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n",
dentry, err);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 8be4dc050d1e..131621daeb13 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -15,10 +15,15 @@
#include <linux/fs.h>
#include "overlayfs.h"
+#include "../internal.h" /* for sb_init_dio_done_wq */
+
struct ovl_aio_req {
struct kiocb iocb;
refcount_t ref;
struct kiocb *orig_iocb;
+ /* used for aio completion */
+ struct work_struct work;
+ long res;
};
static struct kmem_cache *ovl_aio_request_cachep;
@@ -235,10 +240,17 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
return ret;
}
+static void ovl_file_modified(struct file *file)
+{
+ /* Update size/mtime */
+ ovl_copyattr(file_inode(file));
+}
+
static void ovl_file_accessed(struct file *file)
{
struct inode *inode, *upperinode;
struct timespec64 ctime, uctime;
+ struct timespec64 mtime, umtime;
if (file->f_flags & O_NOATIME)
return;
@@ -251,29 +263,23 @@ static void ovl_file_accessed(struct file *file)
ctime = inode_get_ctime(inode);
uctime = inode_get_ctime(upperinode);
- if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) ||
- !timespec64_equal(&ctime, &uctime))) {
- inode->i_mtime = upperinode->i_mtime;
+ mtime = inode_get_mtime(inode);
+ umtime = inode_get_mtime(upperinode);
+ if ((!timespec64_equal(&mtime, &umtime)) ||
+ !timespec64_equal(&ctime, &uctime)) {
+ inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode));
inode_set_ctime_to_ts(inode, uctime);
}
touch_atime(&file->f_path);
}
-static rwf_t ovl_iocb_to_rwf(int ifl)
+#define OVL_IOCB_MASK \
+ (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
+
+static rwf_t iocb_to_rw_flags(int flags)
{
- rwf_t flags = 0;
-
- if (ifl & IOCB_NOWAIT)
- flags |= RWF_NOWAIT;
- if (ifl & IOCB_HIPRI)
- flags |= RWF_HIPRI;
- if (ifl & IOCB_DSYNC)
- flags |= RWF_DSYNC;
- if (ifl & IOCB_SYNC)
- flags |= RWF_SYNC;
-
- return flags;
+ return (__force rwf_t)(flags & OVL_IOCB_MASK);
}
static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
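iocb_to_rw_flags() masks the kiocb flags and casts the result straight to rwf_t; that presumably works because, in this kernel series, IOCB_NOWAIT/HIPRI/DSYNC/SYNC/APPEND are defined from the corresponding RWF_* values, so no per-flag translation is needed. A hedged sketch of a build-time check documenting that assumption (not something the patch adds):

static_assert(IOCB_NOWAIT == (__force int) RWF_NOWAIT);
static_assert(IOCB_HIPRI  == (__force int) RWF_HIPRI);
static_assert(IOCB_DSYNC  == (__force int) RWF_DSYNC);
static_assert(IOCB_SYNC   == (__force int) RWF_SYNC);
static_assert(IOCB_APPEND == (__force int) RWF_APPEND);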
@@ -290,10 +296,8 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
struct kiocb *orig_iocb = aio_req->orig_iocb;
if (iocb->ki_flags & IOCB_WRITE) {
- struct inode *inode = file_inode(orig_iocb->ki_filp);
-
kiocb_end_write(iocb);
- ovl_copyattr(inode);
+ ovl_file_modified(orig_iocb->ki_filp);
}
orig_iocb->ki_pos = iocb->ki_pos;
@@ -310,6 +314,37 @@ static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
orig_iocb->ki_complete(orig_iocb, res);
}
+static void ovl_aio_complete_work(struct work_struct *work)
+{
+ struct ovl_aio_req *aio_req = container_of(work,
+ struct ovl_aio_req, work);
+
+ ovl_aio_rw_complete(&aio_req->iocb, aio_req->res);
+}
+
+static void ovl_aio_queue_completion(struct kiocb *iocb, long res)
+{
+ struct ovl_aio_req *aio_req = container_of(iocb,
+ struct ovl_aio_req, iocb);
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+ /*
+ * Punt to a work queue to serialize updates of mtime/size.
+ */
+ aio_req->res = res;
+ INIT_WORK(&aio_req->work, ovl_aio_complete_work);
+ queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &aio_req->work);
+}
+
+static int ovl_init_aio_done_wq(struct super_block *sb)
+{
+ if (sb->s_dio_done_wq)
+ return 0;
+
+ return sb_init_dio_done_wq(sb);
+}
+
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
@@ -331,8 +366,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
- ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb->ki_flags));
+ rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags);
+
+ ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf);
} else {
struct ovl_aio_req *aio_req;
@@ -398,15 +434,20 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(ifl);
+
file_start_write(real.file);
- ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(ifl));
+ ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf);
file_end_write(real.file);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(file);
} else {
struct ovl_aio_req *aio_req;
+ ret = ovl_init_aio_done_wq(inode->i_sb);
+ if (ret)
+ goto out;
+
ret = -ENOMEM;
aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
if (!aio_req)
@@ -415,7 +456,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
aio_req->orig_iocb = iocb;
kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_flags = ifl;
- aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ aio_req->iocb.ki_complete = ovl_aio_queue_completion;
refcount_set(&aio_req->ref, 2);
kiocb_start_write(&aio_req->iocb);
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
@@ -489,7 +530,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
file_end_write(real.file);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(out);
revert_creds(old_cred);
fdput(real);
@@ -570,7 +611,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(file);
fdput(real);
@@ -654,7 +695,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(inode_out);
+ ovl_file_modified(file_out);
fdput(real_in);
fdput(real_out);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 83ef66644c21..c63b31a460be 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -32,10 +32,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (err)
return err;
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
if (attr->ia_valid & ATTR_SIZE) {
/* Truncate should trigger data copy up as well */
full_copy_up = true;
@@ -54,7 +50,7 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
winode = d_inode(upperdentry);
err = get_write_access(winode);
if (err)
- goto out_drop_write;
+ goto out;
}
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
@@ -78,6 +74,10 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
*/
attr->ia_valid &= ~ATTR_OPEN;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out_put_write;
+
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_do_notify_change(ofs, upperdentry, attr);
@@ -85,12 +85,12 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (!err)
ovl_copyattr(dentry->d_inode);
inode_unlock(upperdentry->d_inode);
+ ovl_drop_write(dentry);
+out_put_write:
if (winode)
put_write_access(winode);
}
-out_drop_write:
- ovl_drop_write(dentry);
out:
return err;
}
@@ -171,7 +171,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getattr(&realpath, stat, request_mask, flags);
+ err = ovl_do_getattr(&realpath, stat, request_mask, flags);
if (err)
goto out;
@@ -196,8 +196,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
(!is_dir ? STATX_NLINK : 0);
ovl_path_lower(dentry, &realpath);
- err = vfs_getattr(&realpath, &lowerstat,
- lowermask, flags);
+ err = ovl_do_getattr(&realpath, &lowerstat, lowermask,
+ flags);
if (err)
goto out;
@@ -249,8 +249,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
ovl_path_lowerdata(dentry, &realpath);
if (realpath.dentry) {
- err = vfs_getattr(&realpath, &lowerdatastat,
- lowermask, flags);
+ err = ovl_do_getattr(&realpath, &lowerdatastat,
+ lowermask, flags);
if (err)
goto out;
} else {
@@ -339,130 +339,6 @@ static const char *ovl_get_link(struct dentry *dentry,
return p;
}
-bool ovl_is_private_xattr(struct super_block *sb, const char *name)
-{
- struct ovl_fs *ofs = OVL_FS(sb);
-
- if (ofs->config.userxattr)
- return strncmp(name, OVL_XATTR_USER_PREFIX,
- sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0;
- else
- return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
- sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0;
-}
-
-int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- int err;
- struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
- struct dentry *upperdentry = ovl_i_dentry_upper(inode);
- struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
- struct path realpath;
- const struct cred *old_cred;
-
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
- if (!value && !upperdentry) {
- ovl_path_lower(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
- revert_creds(old_cred);
- if (err < 0)
- goto out_drop_write;
- }
-
- if (!upperdentry) {
- err = ovl_copy_up(dentry);
- if (err)
- goto out_drop_write;
-
- realdentry = ovl_dentry_upper(dentry);
- }
-
- old_cred = ovl_override_creds(dentry->d_sb);
- if (value) {
- err = ovl_do_setxattr(ofs, realdentry, name, value, size,
- flags);
- } else {
- WARN_ON(flags != XATTR_REPLACE);
- err = ovl_do_removexattr(ofs, realdentry, name);
- }
- revert_creds(old_cred);
-
- /* copy c/mtime */
- ovl_copyattr(inode);
-
-out_drop_write:
- ovl_drop_write(dentry);
-out:
- return err;
-}
-
-int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
- void *value, size_t size)
-{
- ssize_t res;
- const struct cred *old_cred;
- struct path realpath;
-
- ovl_i_path_real(inode, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
- revert_creds(old_cred);
- return res;
-}
-
-static bool ovl_can_list(struct super_block *sb, const char *s)
-{
- /* Never list private (.overlay) */
- if (ovl_is_private_xattr(sb, s))
- return false;
-
- /* List all non-trusted xattrs */
- if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
- return true;
-
- /* list other trusted for superuser only */
- return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
-}
-
-ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
-{
- struct dentry *realdentry = ovl_dentry_real(dentry);
- ssize_t res;
- size_t len;
- char *s;
- const struct cred *old_cred;
-
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_listxattr(realdentry, list, size);
- revert_creds(old_cred);
- if (res <= 0 || size == 0)
- return res;
-
- /* filter out private xattrs */
- for (s = list, len = res; len;) {
- size_t slen = strnlen(s, len) + 1;
-
- /* underlying fs providing us with an broken xattr list? */
- if (WARN_ON(slen > len))
- return -EIO;
-
- len -= slen;
- if (!ovl_can_list(dentry->d_sb, s)) {
- res -= slen;
- memmove(s, s + slen, len);
- } else {
- s += slen;
- }
- }
-
- return res;
-}
-
#ifdef CONFIG_FS_POSIX_ACL
/*
* Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
@@ -611,10 +487,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
struct dentry *upperdentry = ovl_dentry_upper(dentry);
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
- err = ovl_want_write(dentry);
- if (err)
- return err;
-
/*
* If ACL is to be removed from a lower file, check if it exists in
* the first place before copying it up.
@@ -630,7 +502,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
revert_creds(old_cred);
if (IS_ERR(real_acl)) {
err = PTR_ERR(real_acl);
- goto out_drop_write;
+ goto out;
}
posix_acl_release(real_acl);
}
@@ -638,23 +510,26 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
if (!upperdentry) {
err = ovl_copy_up(dentry);
if (err)
- goto out_drop_write;
+ goto out;
realdentry = ovl_dentry_upper(dentry);
}
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
old_cred = ovl_override_creds(dentry->d_sb);
if (acl)
err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
else
err = ovl_do_remove_acl(ofs, realdentry, acl_name);
revert_creds(old_cred);
+ ovl_drop_write(dentry);
/* copy c/mtime */
ovl_copyattr(inode);
-
-out_drop_write:
- ovl_drop_write(dentry);
+out:
return err;
}
@@ -704,7 +579,8 @@ int ovl_update_time(struct inode *inode, int flags)
if (upperpath.dentry) {
touch_atime(&upperpath);
- inode->i_atime = d_inode(upperpath.dentry)->i_atime;
+ inode_set_atime_to_ts(inode,
+ inode_get_atime(d_inode(upperpath.dentry)));
}
}
return 0;
@@ -777,14 +653,14 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
unsigned int flags;
int err;
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
err = ovl_copy_up(dentry);
if (!err) {
ovl_path_real(dentry, &upperpath);
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
old_cred = ovl_override_creds(inode->i_sb);
/*
* Store immutable/append-only flags in xattr and clear them
@@ -797,6 +673,7 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
if (!err)
err = ovl_real_fileattr_set(&upperpath, fa);
revert_creds(old_cred);
+ ovl_drop_write(dentry);
/*
* Merge real inode flags with inode flags read from
@@ -811,7 +688,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
/* Update ctime */
ovl_copyattr(inode);
}
- ovl_drop_write(dentry);
out:
return err;
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 80391c687c2a..03bc8d5dfa31 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -251,7 +251,10 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
err = -EREMOTE;
goto out_err;
}
- if (ovl_is_whiteout(this)) {
+
+ path.dentry = this;
+ path.mnt = d->mnt;
+ if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) {
d->stop = d->opaque = true;
goto put_and_out;
}
@@ -264,8 +267,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto put_and_out;
}
- path.dentry = this;
- path.mnt = d->mnt;
if (!d_can_lookup(this)) {
if (d->is_dir || !last_element) {
d->stop = true;
@@ -438,7 +439,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
else if (IS_ERR(origin))
return PTR_ERR(origin);
- if (upperdentry && !ovl_is_whiteout(upperdentry) &&
+ if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) &&
inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode))
goto invalid;
@@ -507,6 +508,19 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
return err;
}
+int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const struct ovl_fh *fh,
+ bool is_upper, bool set)
+{
+ int err;
+
+ err = ovl_verify_fh(ofs, dentry, ox, fh);
+ if (set && err == -ENODATA)
+ err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+
+ return err;
+}
+
/*
* Verify that @real dentry matches the file handle stored in xattr @name.
*
@@ -515,9 +529,9 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
*
* Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error.
*/
-int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, struct dentry *real, bool is_upper,
- bool set)
+int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real,
+ bool is_upper, bool set)
{
struct inode *inode;
struct ovl_fh *fh;
@@ -530,9 +544,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
goto fail;
}
- err = ovl_verify_fh(ofs, dentry, ox, fh);
- if (set && err == -ENODATA)
- err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+ err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set);
if (err)
goto fail;
@@ -548,6 +560,7 @@ fail:
goto out;
}
+
/* Get upper dentry from index */
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
bool connected)
@@ -684,7 +697,7 @@ orphan:
goto out;
}
-static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name)
+int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name)
{
char *n, *s;
@@ -873,20 +886,27 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
struct dentry *lower, struct dentry *upper)
{
+ const struct ovl_fh *fh;
int err;
if (ovl_check_origin_xattr(ofs, upper))
return 0;
+ fh = ovl_get_origin_fh(ofs, lower);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
err = ovl_want_write(dentry);
if (err)
- return err;
+ goto out;
- err = ovl_set_origin(ofs, lower, upper);
+ err = ovl_set_origin_fh(ofs, fh, upper);
if (!err)
err = ovl_set_impure(dentry->d_parent, upper->d_parent);
ovl_drop_write(dentry);
+out:
+ kfree(fh);
return err;
}
@@ -1383,7 +1403,11 @@ bool ovl_lower_positive(struct dentry *dentry)
break;
}
} else {
- positive = !ovl_is_whiteout(this);
+ struct path path = {
+ .dentry = this,
+ .mnt = parentpath->layer->mnt,
+ };
+ positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path);
done = true;
dput(this);
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 9817b2dcb132..05c3dd597fa8 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -28,7 +28,16 @@ enum ovl_path_type {
#define OVL_XATTR_NAMESPACE "overlay."
#define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1)
#define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_USER_PREFIX_LEN (sizeof(OVL_XATTR_USER_PREFIX) - 1)
+
+#define OVL_XATTR_ESCAPE_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_ESCAPE_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_PREFIX) - 1)
+#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX OVL_XATTR_TRUSTED_PREFIX OVL_XATTR_ESCAPE_PREFIX
+#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_TRUSTED_PREFIX) - 1)
+#define OVL_XATTR_ESCAPE_USER_PREFIX OVL_XATTR_USER_PREFIX OVL_XATTR_ESCAPE_PREFIX
+#define OVL_XATTR_ESCAPE_USER_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_USER_PREFIX) - 1)
enum ovl_xattr {
OVL_XATTR_OPAQUE,
@@ -40,6 +49,8 @@ enum ovl_xattr {
OVL_XATTR_UUID,
OVL_XATTR_METACOPY,
OVL_XATTR_PROTATTR,
+ OVL_XATTR_XWHITEOUT,
+ OVL_XATTR_XWHITEOUTS,
};
enum ovl_inode_flag {
@@ -397,7 +408,19 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}
+static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ if (flags & AT_GETATTR_NOSEC)
+ return vfs_getattr_nosec(path, stat, request_mask, flags);
+ return vfs_getattr(path, stat, request_mask, flags);
+}
+
/* util.c */
+int ovl_get_write_access(struct dentry *dentry);
+void ovl_put_write_access(struct dentry *dentry);
+void ovl_start_write(struct dentry *dentry);
+void ovl_end_write(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
@@ -460,6 +483,7 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_inode_version_get(struct inode *inode);
bool ovl_is_whiteout(struct dentry *dentry);
+bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path);
struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
@@ -467,9 +491,21 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags);
bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
+bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
+bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath);
+static inline bool ovl_upper_is_whiteout(struct ovl_fs *ofs,
+ struct dentry *upperdentry)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+ return ovl_path_is_whiteout(ofs, &upperpath);
+}
+
static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
struct dentry *upperdentry)
{
@@ -624,11 +660,15 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
struct dentry *upperdentry, struct ovl_path **stackp);
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, struct dentry *real, bool is_upper,
- bool set);
+ enum ovl_xattr ox, const struct ovl_fh *fh,
+ bool is_upper, bool set);
+int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real,
+ bool is_upper, bool set);
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
bool connected);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
+int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name);
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct qstr *name);
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
@@ -640,17 +680,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
bool ovl_lower_positive(struct dentry *dentry);
+static inline int ovl_verify_origin_fh(struct ovl_fs *ofs, struct dentry *upper,
+ const struct ovl_fh *fh, bool set)
+{
+ return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, fh, false, set);
+}
+
static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool set)
{
- return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin,
- false, set);
+ return ovl_verify_origin_xattr(ofs, upper, OVL_XATTR_ORIGIN, origin,
+ false, set);
}
static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index,
struct dentry *upper, bool set)
{
- return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set);
+ return ovl_verify_origin_xattr(ofs, index, OVL_XATTR_UPPER, upper,
+ true, set);
}
/* readdir.c */
@@ -684,17 +731,8 @@ int ovl_set_nlink_lower(struct dentry *dentry);
unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
struct dentry *upperdentry,
unsigned int fallback);
-int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr);
-int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
- struct kstat *stat, u32 request_mask, unsigned int flags);
int ovl_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask);
-int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
- const void *value, size_t size, int flags);
-int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
- void *value, size_t size);
-ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
@@ -815,8 +853,9 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentr
int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
bool is_upper);
-int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
- struct dentry *upper);
+struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin);
+int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+ struct dentry *upper);
/* export.c */
extern const struct export_operations ovl_export_operations;
@@ -830,3 +869,12 @@ static inline bool ovl_force_readonly(struct ovl_fs *ofs)
{
return (!ovl_upper_mnt(ofs) || !ofs->workdir);
}
+
+/* xattr.c */
+
+const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs);
+int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);
+int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index f6ff23fd101c..3fe2dde1598f 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -43,8 +43,10 @@ module_param_named(metacopy, ovl_metacopy_def, bool, 0644);
MODULE_PARM_DESC(metacopy,
"Default to on or off for the metadata only copy up feature");
-enum {
+enum ovl_opt {
Opt_lowerdir,
+ Opt_lowerdir_add,
+ Opt_datadir_add,
Opt_upperdir,
Opt_workdir,
Opt_default_permissions,
@@ -140,8 +142,11 @@ static int ovl_verity_mode_def(void)
#define fsparam_string_empty(NAME, OPT) \
__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
+
const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_string_empty("lowerdir", Opt_lowerdir),
+ fsparam_string("lowerdir+", Opt_lowerdir_add),
+ fsparam_string("datadir+", Opt_datadir_add),
fsparam_string("upperdir", Opt_upperdir),
fsparam_string("workdir", Opt_workdir),
fsparam_flag("default_permissions", Opt_default_permissions),
@@ -238,19 +243,8 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
pr_err("failed to resolve '%s': %i\n", name, err);
goto out;
}
- err = -EINVAL;
- if (ovl_dentry_weird(path->dentry)) {
- pr_err("filesystem on '%s' not supported\n", name);
- goto out_put;
- }
- if (!d_is_dir(path->dentry)) {
- pr_err("'%s' not a directory\n", name);
- goto out_put;
- }
return 0;
-out_put:
- path_put_init(path);
out:
return err;
}
@@ -268,7 +262,7 @@ static void ovl_unescape(char *s)
}
}
-static int ovl_mount_dir(const char *name, struct path *path, bool upper)
+static int ovl_mount_dir(const char *name, struct path *path)
{
int err = -ENOMEM;
char *tmp = kstrdup(name, GFP_KERNEL);
@@ -276,68 +270,147 @@ static int ovl_mount_dir(const char *name, struct path *path, bool upper)
if (tmp) {
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
-
- if (!err && upper && path->dentry->d_flags & DCACHE_OP_REAL) {
- pr_err("filesystem on '%s' not supported as upperdir\n",
- tmp);
- path_put_init(path);
- err = -EINVAL;
- }
kfree(tmp);
}
return err;
}
-static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc,
- bool workdir)
+static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
+ enum ovl_opt layer, const char *name, bool upper)
{
- int err;
- struct ovl_fs *ofs = fc->s_fs_info;
- struct ovl_config *config = &ofs->config;
struct ovl_fs_context *ctx = fc->fs_private;
- struct path path;
- char *dup;
- err = ovl_mount_dir(name, &path, true);
- if (err)
- return err;
+ if (ovl_dentry_weird(path->dentry))
+ return invalfc(fc, "filesystem on %s not supported", name);
+
+ if (!d_is_dir(path->dentry))
+ return invalfc(fc, "%s is not a directory", name);
+
/*
* Check whether upper path is read-only here to report failures
* early. Don't forget to recheck when the superblock is created
* as the mount attributes could change.
*/
- if (__mnt_is_readonly(path.mnt)) {
- path_put(&path);
- return -EINVAL;
+ if (upper) {
+ if (path->dentry->d_flags & DCACHE_OP_REAL)
+ return invalfc(fc, "filesystem on %s not supported as upperdir", name);
+ if (__mnt_is_readonly(path->mnt))
+ return invalfc(fc, "filesystem on %s is read-only", name);
+ } else {
+ if (ctx->lowerdir_all && layer != Opt_lowerdir)
+ return invalfc(fc, "lowerdir+ and datadir+ cannot follow lowerdir");
+ if (ctx->nr_data && layer == Opt_lowerdir_add)
+ return invalfc(fc, "regular lower layers cannot follow data layers");
+ if (ctx->nr == OVL_MAX_STACK)
+ return invalfc(fc, "too many lower directories, limit is %d",
+ OVL_MAX_STACK);
}
+ return 0;
+}
- dup = kstrdup(name, GFP_KERNEL);
- if (!dup) {
- path_put(&path);
+static int ovl_ctx_realloc_lower(struct fs_context *fc)
+{
+ struct ovl_fs_context *ctx = fc->fs_private;
+ struct ovl_fs_context_layer *l;
+ size_t nr;
+
+ if (ctx->nr < ctx->capacity)
+ return 0;
+
+ nr = min_t(size_t, max(4096 / sizeof(*l), ctx->capacity * 2),
+ OVL_MAX_STACK);
+ l = krealloc_array(ctx->lower, nr, sizeof(*l), GFP_KERNEL_ACCOUNT);
+ if (!l)
return -ENOMEM;
+
+ ctx->lower = l;
+ ctx->capacity = nr;
+ return 0;
+}
+
+static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer,
+ struct path *path, char **pname)
+{
+ struct ovl_fs *ofs = fc->s_fs_info;
+ struct ovl_config *config = &ofs->config;
+ struct ovl_fs_context *ctx = fc->fs_private;
+ struct ovl_fs_context_layer *l;
+
+ switch (layer) {
+ case Opt_workdir:
+ swap(config->workdir, *pname);
+ swap(ctx->work, *path);
+ break;
+ case Opt_upperdir:
+ swap(config->upperdir, *pname);
+ swap(ctx->upper, *path);
+ break;
+ case Opt_datadir_add:
+ ctx->nr_data++;
+ fallthrough;
+ case Opt_lowerdir_add:
+ WARN_ON(ctx->nr >= ctx->capacity);
+ l = &ctx->lower[ctx->nr++];
+ memset(l, 0, sizeof(*l));
+ swap(l->name, *pname);
+ swap(l->path, *path);
+ break;
+ default:
+ WARN_ON(1);
}
+}
- if (workdir) {
- kfree(config->workdir);
- config->workdir = dup;
- path_put(&ctx->work);
- ctx->work = path;
- } else {
- kfree(config->upperdir);
- config->upperdir = dup;
- path_put(&ctx->upper);
- ctx->upper = path;
+static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
+ enum ovl_opt layer)
+{
+ char *name = kstrdup(param->string, GFP_KERNEL);
+ bool upper = (layer == Opt_upperdir || layer == Opt_workdir);
+ struct path path;
+ int err;
+
+ if (!name)
+ return -ENOMEM;
+
+ if (upper)
+ err = ovl_mount_dir(name, &path);
+ else
+ err = ovl_mount_dir_noesc(name, &path);
+ if (err)
+ goto out_free;
+
+ err = ovl_mount_dir_check(fc, &path, layer, name, upper);
+ if (err)
+ goto out_put;
+
+ if (!upper) {
+ err = ovl_ctx_realloc_lower(fc);
+ if (err)
+ goto out_put;
}
- return 0;
+
+ /* Store the user provided path string in ctx to show in mountinfo */
+ ovl_add_layer(fc, layer, &path, &name);
+
+out_put:
+ path_put(&path);
+out_free:
+ kfree(name);
+ return err;
}
-static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx)
+static void ovl_reset_lowerdirs(struct ovl_fs_context *ctx)
{
- for (size_t nr = 0; nr < ctx->nr; nr++) {
- path_put(&ctx->lower[nr].path);
- kfree(ctx->lower[nr].name);
- ctx->lower[nr].name = NULL;
+ struct ovl_fs_context_layer *l = ctx->lower;
+
+ // Reset old user provided lowerdir string
+ kfree(ctx->lowerdir_all);
+ ctx->lowerdir_all = NULL;
+
+ for (size_t nr = 0; nr < ctx->nr; nr++, l++) {
+ path_put(&l->path);
+ kfree(l->name);
+ l->name = NULL;
}
ctx->nr = 0;
ctx->nr_data = 0;
@@ -346,7 +419,7 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx)
/*
* Parse lowerdir= mount option:
*
- * (1) lowerdir=/lower1:/lower2:/lower3::/data1::/data2
+ * e.g.: lowerdir=/lower1:/lower2:/lower3::/data1::/data2
* Set "/lower1", "/lower2", and "/lower3" as lower layers and
* "/data1" and "/data2" as data lower layers. Any existing lower
* layers are replaced.
@@ -356,9 +429,9 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
int err;
struct ovl_fs_context *ctx = fc->fs_private;
struct ovl_fs_context_layer *l;
- char *dup = NULL, *dup_iter;
- ssize_t nr_lower = 0, nr = 0, nr_data = 0;
- bool append = false, data_layer = false;
+ char *dup = NULL, *iter;
+ ssize_t nr_lower, nr;
+ bool data_layer = false;
/*
* Ensure we're backwards compatible with mount(2)
@@ -366,16 +439,21 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
*/
/* drop all existing lower layers */
- if (!*name) {
- ovl_parse_param_drop_lowerdir(ctx);
+ ovl_reset_lowerdirs(ctx);
+
+ if (!*name)
return 0;
- }
if (*name == ':') {
pr_err("cannot append lower layer");
return -EINVAL;
}
+ // Store user provided lowerdir string to show in mount options
+ ctx->lowerdir_all = kstrdup(name, GFP_KERNEL);
+ if (!ctx->lowerdir_all)
+ return -ENOMEM;
+
dup = kstrdup(name, GFP_KERNEL);
if (!dup)
return -ENOMEM;
@@ -385,36 +463,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
if (nr_lower < 0)
goto out_err;
- if ((nr_lower > OVL_MAX_STACK) ||
- (append && (size_add(ctx->nr, nr_lower) > OVL_MAX_STACK))) {
+ if (nr_lower > OVL_MAX_STACK) {
pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK);
goto out_err;
}
- if (!append)
- ovl_parse_param_drop_lowerdir(ctx);
-
- /*
- * (1) append
- *
- * We want nr <= nr_lower <= capacity We know nr > 0 and nr <=
- * capacity. If nr == 0 this wouldn't be append. If nr +
- * nr_lower is <= capacity then nr <= nr_lower <= capacity
- * already holds. If nr + nr_lower exceeds capacity, we realloc.
- *
- * (2) replace
- *
- * Ensure we're backwards compatible with mount(2) which allows
- * "lowerdir=/a:/b:/c,lowerdir=/d:/e:/f" causing the last
- * specified lowerdir mount option to win.
- *
- * We want nr <= nr_lower <= capacity We know either (i) nr == 0
- * or (ii) nr > 0. We also know nr_lower > 0. The capacity
- * could've been changed multiple times already so we only know
- * nr <= capacity. If nr + nr_lower > capacity we realloc,
- * otherwise nr <= nr_lower <= capacity holds already.
- */
- nr_lower += ctx->nr;
if (nr_lower > ctx->capacity) {
err = -ENOMEM;
l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower),
@@ -426,59 +479,40 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
ctx->capacity = nr_lower;
}
- /*
- * (3) By (1) and (2) we know nr <= nr_lower <= capacity.
- * (4) If ctx->nr == 0 => replace
- * We have verified above that the lowerdir mount option
- * isn't an append, i.e., the lowerdir mount option
- * doesn't start with ":" or "::".
- * (4.1) The lowerdir mount options only contains regular lower
- * layers ":".
- * => Nothing to verify.
- * (4.2) The lowerdir mount options contains regular ":" and
- * data "::" layers.
- * => We need to verify that data lower layers "::" aren't
- * followed by regular ":" lower layers
- * (5) If ctx->nr > 0 => append
- * We know that there's at least one regular layer
- * otherwise we would've failed when parsing the previous
- * lowerdir mount option.
- * (5.1) The lowerdir mount option is a regular layer ":" append
- * => We need to verify that no data layers have been
- * specified before.
- * (5.2) The lowerdir mount option is a data layer "::" append
- * We know that there's at least one regular layer or
- * other data layers. => There's nothing to verify.
- */
- dup_iter = dup;
- for (nr = ctx->nr; nr < nr_lower; nr++) {
- l = &ctx->lower[nr];
+ iter = dup;
+ l = ctx->lower;
+ for (nr = 0; nr < nr_lower; nr++, l++) {
+ ctx->nr++;
memset(l, 0, sizeof(*l));
- err = ovl_mount_dir(dup_iter, &l->path, false);
+ err = ovl_mount_dir(iter, &l->path);
+ if (err)
+ goto out_put;
+
+ err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false);
if (err)
goto out_put;
err = -ENOMEM;
- l->name = kstrdup(dup_iter, GFP_KERNEL_ACCOUNT);
+ l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT);
if (!l->name)
goto out_put;
if (data_layer)
- nr_data++;
+ ctx->nr_data++;
/* Calling strchr() again would overrun. */
- if ((nr + 1) == nr_lower)
+ if (ctx->nr == nr_lower)
break;
err = -EINVAL;
- dup_iter = strchr(dup_iter, '\0') + 1;
- if (*dup_iter) {
+ iter = strchr(iter, '\0') + 1;
+ if (*iter) {
/*
* This is a regular layer so we require that
* there are no data layers.
*/
- if ((ctx->nr_data + nr_data) > 0) {
+ if (ctx->nr_data > 0) {
pr_err("regular lower layers cannot follow data lower layers");
goto out_put;
}
@@ -489,29 +523,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
/* This is a data lower layer. */
data_layer = true;
- dup_iter++;
+ iter++;
}
- ctx->nr = nr_lower;
- ctx->nr_data += nr_data;
kfree(dup);
return 0;
out_put:
- /*
- * We know nr >= ctx->nr < nr_lower. If we failed somewhere
- * we want to undo until nr == ctx->nr. This is correct for
- * both ctx->nr == 0 and ctx->nr > 0.
- */
- for (; nr >= ctx->nr; nr--) {
- l = &ctx->lower[nr];
- kfree(l->name);
- l->name = NULL;
- path_put(&l->path);
-
- /* don't overflow */
- if (nr == 0)
- break;
- }
+ ovl_reset_lowerdirs(ctx);
out_err:
kfree(dup);
@@ -556,11 +574,11 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_lowerdir:
err = ovl_parse_param_lowerdir(param->string, fc);
break;
+ case Opt_lowerdir_add:
+ case Opt_datadir_add:
case Opt_upperdir:
- fallthrough;
case Opt_workdir:
- err = ovl_parse_param_upperdir(param->string, fc,
- (Opt_workdir == opt));
+ err = ovl_parse_layer(fc, param, opt);
break;
case Opt_default_permissions:
config->default_permissions = true;
@@ -617,7 +635,7 @@ static int ovl_get_tree(struct fs_context *fc)
static inline void ovl_fs_context_free(struct ovl_fs_context *ctx)
{
- ovl_parse_param_drop_lowerdir(ctx);
+ ovl_reset_lowerdirs(ctx);
path_put(&ctx->upper);
path_put(&ctx->work);
kfree(ctx->lower);
@@ -933,23 +951,28 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
{
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ofs = OVL_FS(sb);
- size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer;
+ size_t nr, nr_merged_lower, nr_lower = 0;
+ char **lowerdirs = ofs->config.lowerdirs;
/*
- * lowerdirs[] starts from offset 1, then
- * >= 0 regular lower layers prefixed with : and
- * >= 0 data-only lower layers prefixed with ::
- *
-	 * we need to escape comma and space like seq_show_option() does and
- * we also need to escape the colon separator from lowerdir paths.
+	 * lowerdirs[0] holds the colon-separated list that the user provided
+	 * with the lowerdir mount option.
+ * lowerdirs[1..numlayer] hold the lowerdir paths that were added
+ * using the lowerdir+ and datadir+ mount options.
+ * For now, we do not allow mixing the legacy lowerdir mount option
+ * with the new lowerdir+ and datadir+ mount options.
*/
- seq_puts(m, ",lowerdir=");
- for (nr = 1; nr < ofs->numlayer; nr++) {
- if (nr > 1)
- seq_putc(m, ':');
- if (nr >= nr_merged_lower)
- seq_putc(m, ':');
- seq_escape(m, ofs->config.lowerdirs[nr], ":, \t\n\\");
+ if (lowerdirs[0]) {
+ seq_show_option(m, "lowerdir", lowerdirs[0]);
+ } else {
+ nr_lower = ofs->numlayer;
+ nr_merged_lower = nr_lower - ofs->numdatalayer;
+ }
+ for (nr = 1; nr < nr_lower; nr++) {
+ if (nr < nr_merged_lower)
+ seq_show_option(m, "lowerdir+", lowerdirs[nr]);
+ else
+ seq_show_option(m, "datadir+", lowerdirs[nr]);
}
if (ofs->config.upperdir) {
seq_show_option(m, "upperdir", ofs->config.upperdir);
diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h
index 8750da68ab2a..c96d93982021 100644
--- a/fs/overlayfs/params.h
+++ b/fs/overlayfs/params.h
@@ -32,6 +32,7 @@ struct ovl_fs_context {
size_t nr_data;
struct ovl_opt_set set;
struct ovl_fs_context_layer *lower;
+ char *lowerdir_all; /* user provided lowerdir string */
};
int ovl_init_fs_context(struct fs_context *fc);
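
The new lowerdir+/datadir+ options are passed once per layer. As an illustrative userspace sketch (not part of this patch), this is how they could be used through the new mount API; paths and the mountpoint are made up, a libc exposing fsopen()/fsconfig()/fsmount()/move_mount() (glibc >= 2.36) is assumed, and error checking of the per-option fsconfig() calls is omitted for brevity:

/*
 * Illustrative only: stack layers with lowerdir+/datadir+ via fsconfig().
 */
#define _GNU_SOURCE
#include <sys/mount.h>
#include <fcntl.h>
#include <err.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = fsopen("overlay", FSOPEN_CLOEXEC);
	if (fsfd < 0)
		err(1, "fsopen");

	/* each lowerdir+ instance appends one regular lower layer, in order */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir+", "/lower1", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir+", "/lower2", 0);
	/* datadir+ appends a data-only layer; data-only layers need metacopy */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "metacopy", "on", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "datadir+", "/data1", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/upper", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/work", 0);
	if (fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
		err(1, "fsconfig create");

	mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd < 0)
		err(1, "fsmount");
	if (move_mount(mntfd, "", AT_FDCWD, "/mnt/merged",
		       MOVE_MOUNT_F_EMPTY_PATH) < 0)
		err(1, "move_mount");
	return 0;
}
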
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index de39e067ae65..a490fc47c3e7 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -25,6 +25,7 @@ struct ovl_cache_entry {
struct ovl_cache_entry *next_maybe_whiteout;
bool is_upper;
bool is_whiteout;
+ bool check_xwhiteout;
char name[];
};
@@ -47,6 +48,7 @@ struct ovl_readdir_data {
int err;
bool is_upper;
bool d_type_supported;
+ bool in_xwhiteouts_dir;
};
struct ovl_dir_file {
@@ -162,6 +164,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
p->ino = 0;
p->is_upper = rdd->is_upper;
p->is_whiteout = false;
+ /* Defer check for overlay.whiteout to ovl_iterate() */
+ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
@@ -301,6 +305,8 @@ static inline int ovl_dir_read(const struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
+ rdd->in_xwhiteouts_dir = rdd->dentry &&
+ ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
@@ -447,7 +453,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
}
/*
- * Set d_ino for upper entries. Non-upper entries should always report
+ * Set d_ino for upper entries if needed. Non-upper entries should always report
* the uppermost real inode ino and should not call this function.
*
* When not all layer are on same fs, report real ino also for upper.
@@ -455,8 +461,11 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
* When all layers are on the same fs, and upper has a reference to
* copy up origin, call vfs_getattr() on the overlay entry to make
* sure that d_ino will be consistent with st_ino from stat(2).
+ *
+ * Also checks for the overlay.whiteout xattr by doing a full lookup, which
+ * returns a negative dentry for such entries.
*/
-static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p)
+static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino)
{
struct dentry *dir = path->dentry;
@@ -467,7 +476,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
int xinobits = ovl_xino_bits(ofs);
int err = 0;
- if (!ovl_same_dev(ofs))
+ if (!ovl_same_dev(ofs) && !p->check_xwhiteout)
goto out;
if (p->name[0] == '.') {
@@ -481,6 +490,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
goto get;
}
}
+ /* This checks also for xwhiteouts */
this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
/* Mark a stale entry */
@@ -494,6 +504,9 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
}
get:
+ if (!ovl_same_dev(ofs) || !update_ino)
+ goto out;
+
type = ovl_path_type(this);
if (OVL_TYPE_ORIGIN(type)) {
struct kstat stat;
@@ -572,7 +585,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
list_for_each_entry_safe(p, n, list, l_node) {
if (strcmp(p->name, ".") != 0 &&
strcmp(p->name, "..") != 0) {
- err = ovl_cache_update_ino(path, p);
+ err = ovl_cache_update(path, p, true);
if (err)
return err;
}
@@ -778,13 +791,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout) {
- if (!p->ino) {
- err = ovl_cache_update_ino(&file->f_path, p);
+ if (!p->ino || p->check_xwhiteout) {
+ err = ovl_cache_update(&file->f_path, p, !p->ino);
if (err)
goto out;
}
}
- /* ovl_cache_update_ino() sets is_whiteout on stale entry */
+ /* ovl_cache_update() sets is_whiteout on stale entry */
if (!p->is_whiteout) {
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3fa2416264a4..a0967bb25003 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
struct dentry *real = NULL, *lower;
int err;
- /* It's an overlay file */
+ /*
+ * vfs is only expected to call d_real() with NULL from d_real_inode()
+ * and with overlay inode from file_dentry() on an overlay file.
+ *
+ * TODO: remove @inode argument from d_real() API, remove code in this
+ * function that deals with non-NULL @inode and remove d_real() call
+ * from file_dentry().
+ */
if (inode && d_inode(dentry) == inode)
return dentry;
+ else if (inode)
+ goto bug;
if (!d_is_reg(dentry)) {
- if (!inode || inode == d_inode(dentry))
- return dentry;
- goto bug;
+ /* d_real_inode() is only relevant for regular files */
+ return dentry;
}
real = ovl_dentry_upper(dentry);
@@ -437,68 +445,6 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
return ok;
}
-static int ovl_own_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return -EOPNOTSUPP;
-}
-
-static int ovl_own_xattr_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int ovl_other_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return ovl_xattr_get(dentry, inode, name, buffer, size);
-}
-
-static int ovl_other_xattr_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return ovl_xattr_set(dentry, inode, name, value, size, flags);
-}
-
-static const struct xattr_handler ovl_own_trusted_xattr_handler = {
- .prefix = OVL_XATTR_TRUSTED_PREFIX,
- .get = ovl_own_xattr_get,
- .set = ovl_own_xattr_set,
-};
-
-static const struct xattr_handler ovl_own_user_xattr_handler = {
- .prefix = OVL_XATTR_USER_PREFIX,
- .get = ovl_own_xattr_get,
- .set = ovl_own_xattr_set,
-};
-
-static const struct xattr_handler ovl_other_xattr_handler = {
- .prefix = "", /* catch all */
- .get = ovl_other_xattr_get,
- .set = ovl_other_xattr_set,
-};
-
-static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
- &ovl_own_trusted_xattr_handler,
- &ovl_other_xattr_handler,
- NULL
-};
-
-static const struct xattr_handler *ovl_user_xattr_handlers[] = {
- &ovl_own_user_xattr_handler,
- &ovl_other_xattr_handler,
- NULL
-};
-
static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
struct inode **ptrap, const char *name)
{
@@ -639,7 +585,7 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
if (IS_ERR(whiteout))
goto cleanup_temp;
- err = ovl_is_whiteout(whiteout);
+ err = ovl_upper_is_whiteout(ofs, whiteout);
/* Best effort cleanup of whiteout and temp file */
if (err)
@@ -879,15 +825,20 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *indexdir;
+ struct dentry *origin = ovl_lowerstack(oe)->dentry;
+ const struct ovl_fh *fh;
int err;
+ fh = ovl_get_origin_fh(ofs, origin);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
err = mnt_want_write(mnt);
if (err)
- return err;
+ goto out_free_fh;
/* Verify lower root is upper root origin */
- err = ovl_verify_origin(ofs, upperpath->dentry,
- ovl_lowerstack(oe)->dentry, true);
+ err = ovl_verify_origin_fh(ofs, upperpath->dentry, fh, true);
if (err) {
pr_err("failed to verify upper root origin\n");
goto out;
@@ -919,9 +870,10 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
* directory entries.
*/
if (ovl_check_origin_xattr(ofs, ofs->indexdir)) {
- err = ovl_verify_set_fh(ofs, ofs->indexdir,
- OVL_XATTR_ORIGIN,
- upperpath->dentry, true, false);
+ err = ovl_verify_origin_xattr(ofs, ofs->indexdir,
+ OVL_XATTR_ORIGIN,
+ upperpath->dentry, true,
+ false);
if (err)
pr_err("failed to verify index dir 'origin' xattr\n");
}
@@ -939,6 +891,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
out:
mnt_drop_write(mnt);
+out_free_fh:
+ kfree(fh);
return err;
}
@@ -1374,8 +1328,11 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
ofs->layers = layers;
/*
* Layer 0 is reserved for upper even if there's no upper.
- * For consistency, config.lowerdirs[0] is NULL.
+	 * config.lowerdirs[0] stores the user-provided colon-separated
+	 * lowerdir string.
*/
+ ofs->config.lowerdirs[0] = ctx->lowerdir_all;
+ ctx->lowerdir_all = NULL;
ofs->numlayer = 1;
sb->s_stack_depth = 0;
@@ -1485,11 +1442,18 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
- sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
- ovl_trusted_xattr_handlers;
+ sb->s_xattr = ovl_xattr_handlers(ofs);
sb->s_fs_info = ofs;
+#ifdef CONFIG_FS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
- sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+#endif
+ sb->s_iflags |= SB_I_SKIP_SYNC;
+ /*
+ * Ensure that umask handling is done by the filesystems used
+	 * for the upper layer instead of overlayfs as that would
+ * lead to unexpected results.
+ */
+ sb->s_iflags |= SB_I_NOUMASK;
err = -ENOMEM;
root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 89e0d60d35b6..c3f020ca13a8 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -17,12 +17,38 @@
#include <linux/ratelimit.h>
#include "overlayfs.h"
+/* Get write access to upper mnt - may fail if upper sb was remounted ro */
+int ovl_get_write_access(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ return mnt_get_write_access(ovl_upper_mnt(ofs));
+}
+
+/* Get write access to upper sb - may block if upper sb is frozen */
+void ovl_start_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ sb_start_write(ovl_upper_mnt(ofs)->mnt_sb);
+}
+
int ovl_want_write(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
return mnt_want_write(ovl_upper_mnt(ofs));
}
+void ovl_put_write_access(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ mnt_put_write_access(ovl_upper_mnt(ofs));
+}
+
+void ovl_end_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ sb_end_write(ovl_upper_mnt(ofs)->mnt_sb);
+}
+
void ovl_drop_write(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
@@ -55,7 +81,7 @@ int ovl_can_decode_fh(struct super_block *sb)
if (!capable(CAP_DAC_READ_SEARCH))
return 0;
- if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry)
+ if (!exportfs_can_decode_fh(sb->s_export_op))
return 0;
return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
@@ -575,6 +601,16 @@ bool ovl_is_whiteout(struct dentry *dentry)
return inode && IS_WHITEOUT(inode);
}
+/*
+ * Use this over ovl_is_whiteout for upper and lower files, as it also
+ * handles overlay.whiteout xattr whiteout files.
+ */
+bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path)
+{
+ return ovl_is_whiteout(path->dentry) ||
+ ovl_path_check_xwhiteout_xattr(ofs, path);
+}
+
struct file *ovl_path_open(const struct path *path, int flags)
{
struct inode *inode = d_inode(path->dentry);
@@ -644,22 +680,36 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags)
return false;
}
+/*
+ * The copy up "transaction" keeps an elevated mnt write count on upper mnt,
+ * but leaves taking freeze protection on upper sb to lower level helpers.
+ */
int ovl_copy_up_start(struct dentry *dentry, int flags)
{
struct inode *inode = d_inode(dentry);
int err;
err = ovl_inode_lock_interruptible(inode);
- if (!err && ovl_already_copied_up_locked(dentry, flags)) {
+ if (err)
+ return err;
+
+ if (ovl_already_copied_up_locked(dentry, flags))
err = 1; /* Already copied up */
- ovl_inode_unlock(inode);
- }
+ else
+ err = ovl_get_write_access(dentry);
+ if (err)
+ goto out_unlock;
+
+ return 0;
+out_unlock:
+ ovl_inode_unlock(inode);
return err;
}
void ovl_copy_up_end(struct dentry *dentry)
{
+ ovl_put_write_access(dentry);
ovl_inode_unlock(d_inode(dentry));
}
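
A minimal sketch of how a caller is expected to pair the helpers above: ovl_copy_up_start() holds the elevated mnt write count, freeze protection is taken around the actual upper-fs work, and ovl_copy_up_end() drops both the write access and the inode lock. example_do_copy_up() is a made-up placeholder, and the real ovl_copy_up_one() is structured differently (freeze protection is taken by lower-level helpers, as the comment above notes):

/* Illustrative pairing only, not taken from this patch. */
static int example_copy_up(struct dentry *dentry, int flags)
{
	int err = ovl_copy_up_start(dentry, flags);

	/* > 0 means another task already copied this dentry up */
	if (err)
		return err > 0 ? 0 : err;

	ovl_start_write(dentry);		/* freeze protection on upper sb */
	err = example_do_copy_up(dentry);	/* hypothetical worker */
	ovl_end_write(dentry);

	ovl_copy_up_end(dentry);	/* drops write access and inode lock */
	return err;
}
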
@@ -676,6 +726,32 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
return false;
}
+bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
+{
+ struct dentry *dentry = path->dentry;
+ int res;
+
+	/* overlay.whiteout must be a zero-size regular file */
+ if (!d_is_reg(dentry) || i_size_read(d_inode(dentry)) != 0)
+ return false;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUT, NULL, 0);
+ return res >= 0;
+}
+
+bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path)
+{
+ struct dentry *dentry = path->dentry;
+ int res;
+
+	/* overlay.whiteouts must be a directory */
+ if (!d_is_dir(dentry))
+ return false;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0);
+ return res >= 0;
+}
+
/*
* Load persistent uuid from xattr into s_uuid if found, or store a new
* random generated value in s_uuid and in xattr.
@@ -760,6 +836,8 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
#define OVL_XATTR_UUID_POSTFIX "uuid"
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
+#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout"
+#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts"
#define OVL_XATTR_TAB_ENTRY(x) \
[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -775,6 +853,8 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS),
};
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
@@ -898,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper,
return 0;
}
-/**
+/*
* Caller must hold a reference to inode to prevent it from being freed while
* it is marked inuse.
*/
@@ -973,12 +1053,18 @@ static void ovl_cleanup_index(struct dentry *dentry)
struct dentry *index = NULL;
struct inode *inode;
struct qstr name = { };
+ bool got_write = false;
int err;
err = ovl_get_index_name(ofs, lowerdentry, &name);
if (err)
goto fail;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto fail;
+
+ got_write = true;
inode = d_inode(upperdentry);
if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) {
pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n",
@@ -1016,6 +1102,8 @@ static void ovl_cleanup_index(struct dentry *dentry)
goto fail;
out:
+ if (got_write)
+ ovl_drop_write(dentry);
kfree(name.name);
dput(index);
return;
@@ -1062,8 +1150,12 @@ int ovl_nlink_start(struct dentry *dentry)
if (err)
return err;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out_unlock;
+
if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
- goto out;
+ return 0;
old_cred = ovl_override_creds(dentry->d_sb);
/*
@@ -1074,10 +1166,15 @@ int ovl_nlink_start(struct dentry *dentry)
*/
err = ovl_set_nlink_upper(dentry);
revert_creds(old_cred);
-
-out:
if (err)
- ovl_inode_unlock(inode);
+ goto out_drop_write;
+
+ return 0;
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out_unlock:
+ ovl_inode_unlock(inode);
return err;
}
@@ -1086,6 +1183,8 @@ void ovl_nlink_end(struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+ ovl_drop_write(dentry);
+
if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
const struct cred *old_cred;
@@ -1403,14 +1502,16 @@ void ovl_copyattr(struct inode *inode)
realinode = ovl_i_path_real(inode, &realpath);
real_idmap = mnt_idmap(realpath.mnt);
+ spin_lock(&inode->i_lock);
vfsuid = i_uid_into_vfsuid(real_idmap, realinode);
vfsgid = i_gid_into_vfsgid(real_idmap, realinode);
inode->i_uid = vfsuid_into_kuid(vfsuid);
inode->i_gid = vfsgid_into_kgid(vfsgid);
inode->i_mode = realinode->i_mode;
- inode->i_atime = realinode->i_atime;
- inode->i_mtime = realinode->i_mtime;
+ inode_set_atime_to_ts(inode, inode_get_atime(realinode));
+ inode_set_mtime_to_ts(inode, inode_get_mtime(realinode));
inode_set_ctime_to_ts(inode, inode_get_ctime(realinode));
i_size_write(inode, i_size_read(realinode));
+ spin_unlock(&inode->i_lock);
}
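
For illustration only: given the checks above, a lower-layer entry can be prepared offline to act as an "xwhiteout" by making it a zero-size regular file carrying the overlay.whiteout xattr, inside a directory carrying overlay.whiteouts so readdir knows to look. Paths are made up; trusted.* xattrs require CAP_SYS_ADMIN (userxattr mounts use the user.overlay.* names instead). A minimal userspace sketch, not part of this series:

#define _GNU_SOURCE
#include <sys/xattr.h>
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int main(void)
{
	/* zero-size regular file, as ovl_path_check_xwhiteout_xattr() requires */
	int fd = open("lower1/dir/foo", O_CREAT | O_EXCL | O_WRONLY, 0644);
	if (fd < 0)
		err(1, "open");
	close(fd);

	/* only the presence of the xattr matters, the value is not inspected */
	if (setxattr("lower1/dir/foo", "trusted.overlay.whiteout", "y", 1, 0))
		err(1, "setxattr whiteout");
	/* mark the directory so readdir defers the per-entry xwhiteout check */
	if (setxattr("lower1/dir", "trusted.overlay.whiteouts", "y", 1, 0))
		err(1, "setxattr whiteouts");
	return 0;
}
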
diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c
new file mode 100644
index 000000000000..383978e4663c
--- /dev/null
+++ b/fs/overlayfs/xattrs.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/fs.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+
+static bool ovl_is_escaped_xattr(struct super_block *sb, const char *name)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ if (ofs->config.userxattr)
+ return strncmp(name, OVL_XATTR_ESCAPE_USER_PREFIX,
+ OVL_XATTR_ESCAPE_USER_PREFIX_LEN) == 0;
+ else
+ return strncmp(name, OVL_XATTR_ESCAPE_TRUSTED_PREFIX,
+ OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN - 1) == 0;
+}
+
+static bool ovl_is_own_xattr(struct super_block *sb, const char *name)
+{
+ struct ovl_fs *ofs = OVL_FS(sb);
+
+ if (ofs->config.userxattr)
+ return strncmp(name, OVL_XATTR_USER_PREFIX,
+ OVL_XATTR_USER_PREFIX_LEN) == 0;
+ else
+ return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
+ OVL_XATTR_TRUSTED_PREFIX_LEN) == 0;
+}
+
+bool ovl_is_private_xattr(struct super_block *sb, const char *name)
+{
+ return ovl_is_own_xattr(sb, name) && !ovl_is_escaped_xattr(sb, name);
+}
+
+static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdentry = ovl_i_dentry_upper(inode);
+ struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+ struct path realpath;
+ const struct cred *old_cred;
+
+ if (!value && !upperdentry) {
+ ovl_path_lower(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
+ revert_creds(old_cred);
+ if (err < 0)
+ goto out;
+ }
+
+ if (!upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out;
+
+ realdentry = ovl_dentry_upper(dentry);
+ }
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (value) {
+ err = ovl_do_setxattr(ofs, realdentry, name, value, size,
+ flags);
+ } else {
+ WARN_ON(flags != XATTR_REPLACE);
+ err = ovl_do_removexattr(ofs, realdentry, name);
+ }
+ revert_creds(old_cred);
+ ovl_drop_write(dentry);
+
+ /* copy c/mtime */
+ ovl_copyattr(inode);
+out:
+ return err;
+}
+
+static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ ssize_t res;
+ const struct cred *old_cred;
+ struct path realpath;
+
+ ovl_i_path_real(inode, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
+ revert_creds(old_cred);
+ return res;
+}
+
+static bool ovl_can_list(struct super_block *sb, const char *s)
+{
+ /* Never list private (.overlay) */
+ if (ovl_is_private_xattr(sb, s))
+ return false;
+
+ /* List all non-trusted xattrs */
+ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
+ return true;
+
+ /* list other trusted for superuser only */
+ return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ struct dentry *realdentry = ovl_dentry_real(dentry);
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ ssize_t res;
+ size_t len;
+ char *s;
+ const struct cred *old_cred;
+ size_t prefix_len, name_len;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_listxattr(realdentry, list, size);
+ revert_creds(old_cred);
+ if (res <= 0 || size == 0)
+ return res;
+
+ prefix_len = ofs->config.userxattr ?
+ OVL_XATTR_USER_PREFIX_LEN : OVL_XATTR_TRUSTED_PREFIX_LEN;
+
+ /* filter out private xattrs */
+ for (s = list, len = res; len;) {
+ size_t slen = strnlen(s, len) + 1;
+
+		/* underlying fs providing us with a broken xattr list? */
+ if (WARN_ON(slen > len))
+ return -EIO;
+
+ len -= slen;
+ if (!ovl_can_list(dentry->d_sb, s)) {
+ res -= slen;
+ memmove(s, s + slen, len);
+ } else if (ovl_is_escaped_xattr(dentry->d_sb, s)) {
+ res -= OVL_XATTR_ESCAPE_PREFIX_LEN;
+ name_len = slen - prefix_len - OVL_XATTR_ESCAPE_PREFIX_LEN;
+ s += prefix_len;
+ memmove(s, s + OVL_XATTR_ESCAPE_PREFIX_LEN, name_len + len);
+ s += name_len;
+ } else {
+ s += slen;
+ }
+ }
+
+ return res;
+}
+
+static char *ovl_xattr_escape_name(const char *prefix, const char *name)
+{
+ size_t prefix_len = strlen(prefix);
+ size_t name_len = strlen(name);
+ size_t escaped_len;
+ char *escaped, *s;
+
+ escaped_len = prefix_len + OVL_XATTR_ESCAPE_PREFIX_LEN + name_len;
+ if (escaped_len > XATTR_NAME_MAX)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ escaped = kmalloc(escaped_len + 1, GFP_KERNEL);
+ if (escaped == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ s = escaped;
+ memcpy(s, prefix, prefix_len);
+ s += prefix_len;
+ memcpy(s, OVL_XATTR_ESCAPE_PREFIX, OVL_XATTR_ESCAPE_PREFIX_LEN);
+ s += OVL_XATTR_ESCAPE_PREFIX_LEN;
+ memcpy(s, name, name_len + 1);
+
+ return escaped;
+}
+
+static int ovl_own_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ char *escaped;
+ int r;
+
+ escaped = ovl_xattr_escape_name(handler->prefix, name);
+ if (IS_ERR(escaped))
+ return PTR_ERR(escaped);
+
+ r = ovl_xattr_get(dentry, inode, escaped, buffer, size);
+
+ kfree(escaped);
+
+ return r;
+}
+
+static int ovl_own_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ char *escaped;
+ int r;
+
+ escaped = ovl_xattr_escape_name(handler->prefix, name);
+ if (IS_ERR(escaped))
+ return PTR_ERR(escaped);
+
+ r = ovl_xattr_set(dentry, inode, escaped, value, size, flags);
+
+ kfree(escaped);
+
+ return r;
+}
+
+static int ovl_other_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return ovl_xattr_get(dentry, inode, name, buffer, size);
+}
+
+static int ovl_other_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return ovl_xattr_set(dentry, inode, name, value, size, flags);
+}
+
+static const struct xattr_handler ovl_own_trusted_xattr_handler = {
+ .prefix = OVL_XATTR_TRUSTED_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_own_user_xattr_handler = {
+ .prefix = OVL_XATTR_USER_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_other_xattr_handler = {
+ .prefix = "", /* catch all */
+ .get = ovl_other_xattr_get,
+ .set = ovl_other_xattr_set,
+};
+
+static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = {
+ &ovl_own_trusted_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+static const struct xattr_handler * const ovl_user_xattr_handlers[] = {
+ &ovl_own_user_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs)
+{
+ return ofs->config.userxattr ? ovl_user_xattr_handlers :
+ ovl_trusted_xattr_handlers;
+}
+
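
A standalone userspace sketch of the name mapping that ovl_xattr_escape_name() implements: the kernel xattr handler framework strips the matched prefix, so a request for "trusted.overlay.foo" reaches ovl_own_xattr_get/set as prefix "trusted.overlay." plus name "foo", and is stored on the real layer under the escaped name. This demo only mirrors the string logic and is not taken from the patch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define OVL_ESCAPE "overlay."

/* mirror of ovl_xattr_escape_name(): insert "overlay." after the prefix */
static char *escape_name(const char *prefix, const char *name)
{
	size_t len = strlen(prefix) + strlen(OVL_ESCAPE) + strlen(name);
	char *out = malloc(len + 1);

	if (!out)
		return NULL;
	sprintf(out, "%s%s%s", prefix, OVL_ESCAPE, name);
	return out;
}

int main(void)
{
	char *stored = escape_name("trusted.overlay.", "foo");

	/* prints: trusted.overlay.foo -> trusted.overlay.overlay.foo */
	printf("trusted.overlay.foo -> %s\n", stored ? stored : "(oom)");
	free(stored);
	return 0;
}

ovl_listxattr() above performs the inverse step, dropping the "overlay." escape from listed names so the overlay user again sees "trusted.overlay.foo".
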
diff --git a/fs/pipe.c b/fs/pipe.c
index 139190165a1c..804a7d789452 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe)
return !pipe_empty(head, tail) || !writers;
}
+static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf,
+ unsigned int tail)
+{
+ pipe_buf_release(pipe, buf);
+
+ /*
+	 * If the pipe has a watch_queue, we need the additional protection
+	 * of the spinlock, because notifications are posted under this
+	 * spinlock alone, without holding the pipe mutex.
+ */
+ if (pipe_has_watch_queue(pipe)) {
+ spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+ if (buf->flags & PIPE_BUF_FLAG_LOSS)
+ pipe->note_loss = true;
+#endif
+ pipe->tail = ++tail;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ return tail;
+ }
+
+ /*
+ * Without a watch_queue, we can simply increment the tail
+ * without the spinlock - the mutex is enough.
+ */
+ pipe->tail = ++tail;
+ return tail;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
@@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
buf->len = 0;
}
- if (!buf->len) {
- pipe_buf_release(pipe, buf);
- spin_lock_irq(&pipe->rd_wait.lock);
-#ifdef CONFIG_WATCH_QUEUE
- if (buf->flags & PIPE_BUF_FLAG_LOSS)
- pipe->note_loss = true;
-#endif
- tail++;
- pipe->tail = tail;
- spin_unlock_irq(&pipe->rd_wait.lock);
- }
+ if (!buf->len)
+ tail = pipe_update_tail(pipe, buf, tail);
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
@@ -437,12 +458,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue) {
+ if (pipe_has_watch_queue(pipe)) {
ret = -EXDEV;
goto out;
}
-#endif
/*
* If it wasn't empty we try to merge new data into
@@ -507,16 +526,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
- spin_lock_irq(&pipe->rd_wait.lock);
-
- head = pipe->head;
- if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- spin_unlock_irq(&pipe->rd_wait.lock);
- continue;
- }
-
pipe->head = head + 1;
- spin_unlock_irq(&pipe->rd_wait.lock);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
@@ -854,7 +864,7 @@ void free_pipe_info(struct pipe_inode_info *pipe)
kfree(pipe);
}
-static struct vfsmount *pipe_mnt __read_mostly;
+static struct vfsmount *pipe_mnt __ro_after_init;
/*
* pipefs_dname() is called from d_path().
@@ -898,7 +908,7 @@ static struct inode * get_pipe_inode(void)
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
return inode;
@@ -1324,10 +1334,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
unsigned int nr_slots, size;
long ret = 0;
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue)
+ if (pipe_has_watch_queue(pipe))
return -EBUSY;
-#endif
size = round_pipe_size(arg);
nr_slots = size >> PAGE_SHIFT;
@@ -1379,10 +1387,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
if (file->f_op != &pipefifo_fops || !pipe)
return NULL;
-#ifdef CONFIG_WATCH_QUEUE
- if (for_splice && pipe->watch_queue)
+ if (for_splice && pipe_has_watch_queue(pipe))
return NULL;
-#endif
return pipe;
}
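
pipe_has_watch_queue() itself is added in include/linux/pipe_fs_i.h and is not visible in this hunk; presumably it just folds the CONFIG_WATCH_QUEUE ifdef that the removed lines carried into one place, along these lines:

/* Sketch of the helper used above; the real definition is not in this diff. */
static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
{
#ifdef CONFIG_WATCH_QUEUE
	return pipe->watch_queue != NULL;
#else
	return false;
#endif
}
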
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 2c2efbe685d8..ff08a8957552 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -536,12 +536,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t = task;
- do {
+ struct task_struct *t;
+
+ __for_each_thread(sig, t) {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
gtime += task_gtime(t);
- } while_each_thread(task, t);
+ }
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..dd31e3b6bf77 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1153,11 +1153,10 @@ err_unlock:
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1213,11 +1212,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_score_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1358,13 +1356,13 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int make_it_fail;
int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- memset(buffer, 0, sizeof(buffer));
+
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1509,11 +1507,10 @@ sched_autogroup_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int nice;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1666,10 +1663,9 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[TASK_COMM_LEN];
+ char buffer[TASK_COMM_LEN] = {};
const size_t maxlen = sizeof(buffer) - 1;
- memset(buffer, 0, sizeof(buffer));
if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
@@ -1902,7 +1898,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb,
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &proc_def_inode_operations;
/*
@@ -2218,7 +2214,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
rc = -ENOENT;
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
@@ -2976,8 +2972,7 @@ static const struct file_operations proc_coredump_filter_operations = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
- struct task_io_accounting acct = task->ioac;
- unsigned long flags;
+ struct task_io_accounting acct;
int result;
result = down_read_killable(&task->signal->exec_update_lock);
@@ -2989,15 +2984,28 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
goto out_unlock;
}
- if (whole && lock_task_sighand(task, &flags)) {
- struct task_struct *t = task;
+ if (whole) {
+ struct signal_struct *sig = task->signal;
+ struct task_struct *t;
+ unsigned int seq = 1;
+ unsigned long flags;
+
+ rcu_read_lock();
+ do {
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
- task_io_accounting_add(&acct, &task->signal->ioac);
- while_each_thread(task, t)
- task_io_accounting_add(&acct, &t->ioac);
+ acct = sig->ioac;
+ __for_each_thread(sig, t)
+ task_io_accounting_add(&acct, &t->ioac);
- unlock_task_sighand(task, &flags);
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ rcu_read_unlock();
+ } else {
+ acct = task->ioac;
}
+
seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
@@ -3818,7 +3826,7 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
for_each_thread(task, pos) {
if (!nr--)
goto found;
- };
+ }
fail:
pos = NULL;
goto out;
@@ -3840,10 +3848,8 @@ static struct task_struct *next_tid(struct task_struct *start)
struct task_struct *pos = NULL;
rcu_read_lock();
if (pid_alive(start)) {
- pos = next_thread(start);
- if (thread_group_leader(pos))
- pos = NULL;
- else
+ pos = __next_thread(start);
+ if (pos)
get_task_struct(pos);
}
rcu_read_unlock();
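The do_io_accounting() hunk above uses the lockless-first seqlock reader pattern around sig->stats_lock. A minimal sketch of that pattern, assuming only the stock <linux/seqlock.h> helpers and existing signal_struct fields; this function is illustrative and not part of the patch:

/*
 * Sketch: seq starts odd; the first pass bumps it to an even value and
 * reads without the lock. If a writer raced, need_seqretry() asks for a
 * retry, and the second pass (odd seq) takes stats_lock so the loop
 * cannot spin forever.
 */
static unsigned long read_group_min_flt(struct signal_struct *sig)
{
	unsigned long flags, flt;
	int seq = 1;

	do {
		seq++;	/* 2 => lockless first pass, odd => locked retry */
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		flt = sig->min_flt;	/* any stats_lock-protected field */
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);

	return flt;
}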
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 2e244ada1f97..902b326e1e56 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -62,6 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
break;
dst += ret;
}
+ if (ret >= 0 && boot_command_line[0]) {
+ ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+ boot_command_line);
+ if (ret > 0)
+ dst += ret;
+ }
}
out:
kfree(key);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
struct file *file;
rcu_read_lock();
- file = task_lookup_fd_rcu(task, fd);
- if (file)
- *mode = file->f_mode;
+ file = task_lookup_fdget_rcu(task, fd);
rcu_read_unlock();
+ if (file) {
+ *mode = file->f_mode;
+ fput(file);
+ }
return !!file;
}
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fd_rcu(p, &fd);
+ f = task_lookup_next_fdget_rcu(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
rcu_read_unlock();
+ fput(f);
data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 532dc9d240f7..b33e490e3fd9 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -110,18 +110,15 @@ void __init proc_init_kmemcache(void)
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
- struct inode *inode;
- struct proc_inode *ei;
struct hlist_node *node;
struct super_block *old_sb = NULL;
rcu_read_lock();
- for (;;) {
+ while ((node = hlist_first_rcu(inodes))) {
+ struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
struct super_block *sb;
- node = hlist_first_rcu(inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ struct inode *inode;
+
spin_lock(lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(lock);
@@ -660,7 +657,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_private = de->data;
inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
make_empty_dir_inode(inode);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 23fc24d16b31..6422e569b080 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -546,7 +546,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
* and explicitly excluded physical ranges.
*/
if (!page || PageOffline(page) ||
- is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
+ is_page_hwpoison(page) || !pfn_is_ram(pfn) ||
+ pfn_is_unaccepted_memory(pfn)) {
if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
}
seq_putc(m, '\n');
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c88854df0b62..8064ea76f80b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -465,7 +465,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
head->count++;
spin_unlock(&sysctl_lock);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
@@ -1576,7 +1576,6 @@ static const struct sysctl_alias sysctl_aliases[] = {
{"hung_task_panic", "kernel.hung_task_panic" },
{"numa_zonelist_order", "vm.numa_zonelist_order" },
{"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
- {"softlockup_panic", "kernel.softlockup_panic" },
{ }
};
@@ -1592,6 +1591,13 @@ static const char *sysctl_find_alias(char *param)
return NULL;
}
+bool sysctl_is_alias(char *param)
+{
+ const char *alias = sysctl_find_alias(param);
+
+ return alias != NULL;
+}
+
/* Set sysctl value passed on kernel command line. */
static int process_sysctl_arg(char *param, char *val,
const char *unused, void *arg)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9191248f2dac..b55dbc70287b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -188,7 +188,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
/* procfs dentries and inodes don't require IO to create */
- s->s_shrink.seeks = 0;
+ s->s_shrink->seeks = 0;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ecc4da8d265e..b46fbfd22681 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..435b61054b5b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -20,6 +20,8 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -296,7 +298,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
if (anon_name)
seq_printf(m, "[anon_shmem:%s]", anon_name->name);
else
- seq_file_path(m, file, "\n");
+ seq_path(m, file_user_path(file), "\n");
goto done;
}
@@ -849,9 +851,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
-
- memset(&mss, 0, sizeof(mss));
+ struct mem_size_stats mss = {};
smap_gather_stats(vma, &mss, 0);
@@ -877,7 +877,7 @@ static int show_smap(struct seq_file *m, void *v)
static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mem_size_stats mss;
+ struct mem_size_stats mss = {};
struct mm_struct *mm = priv->mm;
struct vm_area_struct *vma;
unsigned long vma_start = 0, last_vma_end = 0;
@@ -893,8 +893,6 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_task;
}
- memset(&mss, 0, sizeof(mss));
-
ret = mmap_read_lock_killable(mm);
if (ret)
goto out_put_mm;
@@ -1246,14 +1244,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
int itype;
int rv;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1761,11 +1758,753 @@ static int pagemap_release(struct inode *inode, struct file *file)
return 0;
}
+#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
+ PAGE_IS_FILE | PAGE_IS_PRESENT | \
+ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
+ PAGE_IS_HUGE)
+#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+struct pagemap_scan_private {
+ struct pm_scan_arg arg;
+ unsigned long masks_of_interest, cur_vma_category;
+ struct page_region *vec_buf;
+ unsigned long vec_buf_len, vec_buf_index, found_pages;
+ struct page_region __user *vec_out;
+};
+
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ unsigned long categories = 0;
+
+ if (pte_present(pte)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page(vma, addr, pte);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pte(pte)) {
+ swp_entry_t swp;
+
+ categories |= PAGE_IS_SWAPPED;
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ swp = pte_to_swp_entry(pte);
+ if (is_pfn_swap_entry(swp) &&
+ !PageAnon(pfn_swap_entry_to_page(swp)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+ pte_t ptent = ptep_get(pte);
+
+ if (pte_present(ptent)) {
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_mkuffd_wp(ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ } else if (is_swap_pte(ptent)) {
+ ptent = pte_swp_mkuffd_wp(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+ } else {
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pmd_present(pmd)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pmd_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pmd_pfn(pmd)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pmd(pmd)) {
+ swp_entry_t swp;
+
+ categories |= PAGE_IS_SWAPPED;
+ if (!pmd_swp_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ swp = pmd_to_swp_entry(pmd);
+ if (is_pfn_swap_entry(swp) &&
+ !PageAnon(pfn_swap_entry_to_page(swp)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old, pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ old = pmdp_invalidate_ad(vma, addr, pmdp);
+ pmd = pmd_mkuffd_wp(old);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ /*
+	 * According to pagemap_hugetlb_range(), a file-backed HugeTLB
+	 * page cannot be swapped, so PAGE_IS_FILE is not checked for
+ * swapped pages.
+ */
+ if (pte_present(pte)) {
+ categories |= PAGE_IS_PRESENT;
+ if (!huge_pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (!PageAnon(pte_page(pte)))
+ categories |= PAGE_IS_FILE;
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pte(pte)) {
+ categories |= PAGE_IS_SWAPPED;
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t ptent)
+{
+ unsigned long psize;
+
+ if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
+ return;
+
+ psize = huge_page_size(hstate_vma(vma));
+
+ if (is_hugetlb_entry_migration(ptent))
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ pte_swp_mkuffd_wp(ptent), psize);
+ else if (!huge_pte_none(ptent))
+ huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+ huge_pte_mkuffd_wp(ptent));
+ else
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ if (cur_buf->start != addr)
+ cur_buf->end = addr;
+ else
+ cur_buf->start = cur_buf->end = 0;
+
+ p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ categories ^= p->arg.category_inverted;
+ if ((categories & p->arg.category_mask) != p->arg.category_mask)
+ return false;
+ if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
+ return false;
+
+ return true;
+}
+
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+ categories ^= p->arg.category_inverted;
+ if ((categories & required) != required)
+ return false;
+
+ return true;
+}
+
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long vma_category = 0;
+ bool wp_allowed = userfaultfd_wp_async(vma) &&
+ userfaultfd_wp_use_markers(vma);
+
+ if (!wp_allowed) {
+ /* User requested explicit failure over wp-async capability */
+ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+ return -EPERM;
+ /*
+ * User requires wr-protect, and allows silently skipping
+ * unsupported vmas.
+ */
+ if (p->arg.flags & PM_SCAN_WP_MATCHING)
+ return 1;
+ /*
+		 * Otherwise the request doesn't involve wr-protects at all;
+		 * fall through to the remaining checks and allow the vma walk.
+ */
+ }
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (wp_allowed)
+ vma_category |= PAGE_IS_WPALLOWED;
+
+ if (!pagemap_scan_is_interesting_vma(vma_category, p))
+ return 1;
+
+ p->cur_vma_category = vma_category;
+
+ return 0;
+}
+
+static bool pagemap_scan_push_range(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ /*
+	 * When no output buffer is provided at all, the sentinel values won't
+	 * match here: `cur_buf->end` can only be non-zero once the current
+	 * entry actually holds a (non-empty) range.
+ */
+ if (addr == cur_buf->end && categories == cur_buf->categories) {
+ cur_buf->end = end;
+ return true;
+ }
+
+ if (cur_buf->end) {
+ if (p->vec_buf_index >= p->vec_buf_len - 1)
+ return false;
+
+ cur_buf = &p->vec_buf[++p->vec_buf_index];
+ }
+
+ cur_buf->start = addr;
+ cur_buf->end = end;
+ cur_buf->categories = categories;
+
+ return true;
+}
+
+static int pagemap_scan_output(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long *end)
+{
+ unsigned long n_pages, total_pages;
+ int ret = 0;
+
+ if (!p->vec_buf)
+ return 0;
+
+ categories &= p->arg.return_mask;
+
+ n_pages = (*end - addr) / PAGE_SIZE;
+ if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+ total_pages > p->arg.max_pages) {
+ size_t n_too_much = total_pages - p->arg.max_pages;
+ *end -= n_too_much * PAGE_SIZE;
+ n_pages -= n_too_much;
+ ret = -ENOSPC;
+ }
+
+ if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+ *end = addr;
+ n_pages = 0;
+ ret = -ENOSPC;
+ }
+
+ p->found_pages += n_pages;
+ if (ret)
+ p->arg.walk_end = *end;
+
+ return ret;
+}
+
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return -ENOENT;
+
+ categories = p->cur_vma_category |
+ pagemap_thp_category(p, vma, start, *pmd);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ goto out_unlock;
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ /*
+ * Break huge page into small pages if the WP operation
+ * needs to be performed on a portion of the huge page.
+ */
+ if (end != start + HPAGE_SIZE) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, start);
+ pagemap_scan_backout_range(p, start, end);
+ /* Report as if there was no THP */
+ return -ENOENT;
+ }
+
+ make_uffd_wp_pmd(vma, start, pmd);
+ flush_tlb_range(vma, start, end);
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+ return -ENOENT;
+#endif
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long addr, flush_end = 0;
+ pte_t *pte, *start_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ arch_enter_lazy_mmu_mode();
+
+ ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+ if (ret != -ENOENT) {
+ arch_leave_lazy_mmu_mode();
+ return ret;
+ }
+
+ ret = 0;
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ if (!pte) {
+ arch_leave_lazy_mmu_mode();
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
+ /* Fast path for performing exclusive WP */
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ if (pte_uffd_wp(ptep_get(pte)))
+ continue;
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = addr + PAGE_SIZE;
+ }
+ goto flush_and_return;
+ }
+
+ if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+ p->arg.category_mask == PAGE_IS_WRITTEN &&
+ p->arg.return_mask == PAGE_IS_WRITTEN) {
+ for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (pte_uffd_wp(ptep_get(pte)))
+ continue;
+ ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
+ p, addr, &next);
+ if (next == addr)
+ break;
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+ goto flush_and_return;
+ }
+
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ unsigned long categories = p->cur_vma_category |
+ pagemap_page_category(p, vma, addr, ptep_get(pte));
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ continue;
+
+ ret = pagemap_scan_output(categories, p, addr, &next);
+ if (next == addr)
+ break;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ if (~categories & PAGE_IS_WRITTEN)
+ continue;
+
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+
+flush_and_return:
+ if (flush_end)
+ flush_tlb_range(vma, start, addr);
+
+ pte_unmap_unlock(start_pte, ptl);
+ arch_leave_lazy_mmu_mode();
+
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+ pte_t pte;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+ /* Go the short route when not write-protecting pages. */
+
+ pte = huge_ptep_get(ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ return 0;
+
+ return pagemap_scan_output(categories, p, start, &end);
+ }
+
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+ pte = huge_ptep_get(ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ if (end != start + HPAGE_SIZE) {
+ /* Partial HugeTLB page WP isn't possible. */
+ pagemap_scan_backout_range(p, start, end);
+ p->arg.walk_end = start;
+ ret = 0;
+ goto out_unlock;
+ }
+
+ make_uffd_wp_huge_pte(vma, start, ptep, pte);
+ flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+ spin_unlock(ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+ return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+ int depth, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret, err;
+
+ if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+ return 0;
+
+ ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+ if (addr == end)
+ return ret;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ return ret;
+
+ err = uffd_wp_range(vma, addr, end - addr, true);
+ if (err < 0)
+ ret = err;
+
+ return ret;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+ .test_walk = pagemap_scan_test_walk,
+ .pmd_entry = pagemap_scan_pmd_entry,
+ .pte_hole = pagemap_scan_pte_hole,
+ .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+ unsigned long uarg)
+{
+ if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+ return -EFAULT;
+
+ if (arg->size != sizeof(struct pm_scan_arg))
+ return -EINVAL;
+
+ /* Validate requested features */
+ if (arg->flags & ~PM_SCAN_FLAGS)
+ return -EINVAL;
+ if ((arg->category_inverted | arg->category_mask |
+ arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+ return -EINVAL;
+
+ arg->start = untagged_addr((unsigned long)arg->start);
+ arg->end = untagged_addr((unsigned long)arg->end);
+ arg->vec = untagged_addr((unsigned long)arg->vec);
+
+ /* Validate memory pointers */
+ if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+ return -EINVAL;
+ if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+ return -EFAULT;
+ if (!arg->vec && arg->vec_len)
+ return -EINVAL;
+ if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+ arg->vec_len * sizeof(struct page_region)))
+ return -EFAULT;
+
+ /* Fixup default values */
+ arg->end = ALIGN(arg->end, PAGE_SIZE);
+ arg->walk_end = 0;
+ if (!arg->max_pages)
+ arg->max_pages = ULONG_MAX;
+
+ return 0;
+}
+
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+ unsigned long uargl)
+{
+ struct pm_scan_arg __user *uarg = (void __user *)uargl;
+
+ if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+ if (!p->arg.vec_len)
+ return 0;
+
+ p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+ p->arg.vec_len);
+ p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+ GFP_KERNEL);
+ if (!p->vec_buf)
+ return -ENOMEM;
+
+ p->vec_buf->start = p->vec_buf->end = 0;
+ p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+ return 0;
+}
+
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+ const struct page_region *buf = p->vec_buf;
+ long n = p->vec_buf_index;
+
+ if (!p->vec_buf)
+ return 0;
+
+ if (buf[n].end != buf[n].start)
+ n++;
+
+ if (!n)
+ return 0;
+
+ if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+ return -EFAULT;
+
+ p->arg.vec_len -= n;
+ p->vec_out += n;
+
+ p->vec_buf_index = 0;
+ p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+ p->vec_buf->start = p->vec_buf->end = 0;
+
+ return n;
+}
+
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+ struct mmu_notifier_range range;
+ struct pagemap_scan_private p = {0};
+ unsigned long walk_start;
+ size_t n_ranges_out = 0;
+ int ret;
+
+ ret = pagemap_scan_get_args(&p.arg, uarg);
+ if (ret)
+ return ret;
+
+ p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+ p.arg.return_mask;
+ ret = pagemap_scan_init_bounce_buffer(&p);
+ if (ret)
+ return ret;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, p.arg.start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ for (walk_start = p.arg.start; walk_start < p.arg.end;
+ walk_start = p.arg.walk_end) {
+ long n_out;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ break;
+ ret = walk_page_range(mm, walk_start, p.arg.end,
+ &pagemap_scan_ops, &p);
+ mmap_read_unlock(mm);
+
+ n_out = pagemap_scan_flush_buffer(&p);
+ if (n_out < 0)
+ ret = n_out;
+ else
+ n_ranges_out += n_out;
+
+ if (ret != -ENOSPC)
+ break;
+
+ if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+ break;
+ }
+
+ /* ENOSPC signifies early stop (buffer full) from the walk. */
+ if (!ret || ret == -ENOSPC)
+ ret = n_ranges_out;
+
+ /* The walk_end isn't set when ret is zero */
+ if (!p.arg.walk_end)
+ p.arg.walk_end = p.arg.end;
+ if (pagemap_scan_writeback_args(&p.arg, uarg))
+ ret = -EFAULT;
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
+ kfree(p.vec_buf);
+ return ret;
+}
+
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mm_struct *mm = file->private_data;
+
+ switch (cmd) {
+ case PAGEMAP_SCAN:
+ return do_pagemap_scan(mm, arg);
+
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
.release = pagemap_release,
+ .unlocked_ioctl = do_pagemap_cmd,
+ .compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
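The ioctl wired up just above is driven from userspace via /proc/<pid>/pagemap. A hedged usage sketch, not part of this kernel patch, assuming the UAPI additions that accompany this series in <linux/fs.h> (struct pm_scan_arg, struct page_region, PAGE_IS_PRESENT, PAGEMAP_SCAN):

/*
 * List which pages of a private anonymous mapping are currently present.
 * Compile as ordinary userspace C against headers that carry the new
 * PAGEMAP_SCAN definitions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/fs.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, 16 * psize, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct page_region regions[16];
	struct pm_scan_arg arg = { .size = sizeof(arg) };
	int fd, n;

	if (buf == MAP_FAILED)
		return 1;
	buf[0] = 1;			/* fault in the first page */
	buf[5 * psize] = 1;		/* ...and the sixth */

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;

	arg.start = (uintptr_t)buf;
	arg.end = (uintptr_t)buf + 16 * psize;
	arg.vec = (uintptr_t)regions;
	arg.vec_len = 16;
	arg.category_mask = PAGE_IS_PRESENT;	/* only resident pages */
	arg.return_mask = PAGE_IS_PRESENT;	/* max_pages == 0 means no limit */

	n = ioctl(fd, PAGEMAP_SCAN, &arg);	/* >= 0: number of regions filled */
	for (int i = 0; i < n; i++)
		printf("present: %llx-%llx\n",
		       (unsigned long long)regions[i].start,
		       (unsigned long long)regions[i].end);

	close(fd);
	return 0;
}

With two pages touched, this is expected to report two single-page regions. vec_len bounds how many regions one call can return, and walk_end tells the caller where to resume; internally the walk stops with -ENOSPC when the buffer fills, but the ioctl still returns the count of regions it managed to store.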
@@ -1945,8 +2684,9 @@ static int show_numa_map(struct seq_file *m, void *v)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
char buffer[64];
+ struct mempolicy *pol;
+ pgoff_t ilx;
int nid;
if (!mm)
@@ -1955,7 +2695,7 @@ static int show_numa_map(struct seq_file *m, void *v)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- pol = __get_vma_policy(vma, vma->vm_start);
+ pol = __get_vma_policy(vma, vma->vm_start, &ilx);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
@@ -1967,7 +2707,7 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
+ seq_path(m, file_user_path(file), "\n\t= ");
} else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 7cebd397cc26..bce674533000 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
} else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 63ac1f93289f..0e5050d6ab64 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 585360706b33..d41c20d1b5e8 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
@@ -390,7 +390,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
inode->i_private = private;
if (record->time.tv_sec)
- inode->i_mtime = inode_set_ctime_to_ts(inode, record->time);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, record->time));
d_add(dentry, inode);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index e5bca9a004cc..03425928d2fb 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -464,6 +464,8 @@ out:
*/
int pstore_register(struct pstore_info *psi)
{
+ char *new_backend;
+
if (backend && strcmp(backend, psi->name)) {
pr_warn("backend '%s' already in use: ignoring '%s'\n",
backend, psi->name);
@@ -484,11 +486,16 @@ int pstore_register(struct pstore_info *psi)
return -EINVAL;
}
+ new_backend = kstrdup(psi->name, GFP_KERNEL);
+ if (!new_backend)
+ return -ENOMEM;
+
mutex_lock(&psinfo_lock);
if (psinfo) {
pr_warn("backend '%s' already loaded: ignoring '%s'\n",
psinfo->name, psi->name);
mutex_unlock(&psinfo_lock);
+ kfree(new_backend);
return -EBUSY;
}
@@ -521,7 +528,7 @@ int pstore_register(struct pstore_info *psi)
* Update the module parameter backend, so it is visible
* through /sys/module/pstore/parameters/backend
*/
- backend = kstrdup(psi->name, GFP_KERNEL);
+ backend = new_backend;
pr_info("Registered %s as persistent store backend\n", psi->name);
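The pstore_register() hunk above moves the kstrdup() of the backend name in front of psinfo_lock so that the contended path only has to kfree() the copy. A hedged, kernel-style sketch of the same allocate-outside-the-lock pattern in isolation; all names here are illustrative, not pstore's:

/*
 * Do the sleeping allocation before taking the lock; if someone else
 * already registered, the only cleanup needed is kfree().
 */
static DEFINE_MUTEX(reg_lock);
static char *registered_name;	/* protected by reg_lock */

static int register_name(const char *name)
{
	char *copy = kstrdup(name, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;

	mutex_lock(&reg_lock);
	if (registered_name) {
		mutex_unlock(&reg_lock);
		kfree(copy);	/* lost the race, drop our copy */
		return -EBUSY;
	}
	registered_name = copy;
	mutex_unlock(&reg_lock);

	return 0;
}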
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index a7171f5532a1..6eb9bb369b57 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -301,10 +301,8 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid));
set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
inode->i_size = le32_to_cpu(raw_inode->di_size);
- inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime);
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode, le32_to_cpu(raw_inode->di_mtime), 0);
+ inode_set_atime(inode, le32_to_cpu(raw_inode->di_atime), 0);
inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0);
inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 21f90d519f1a..a286c545717f 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -558,10 +558,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid));
i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid));
inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size);
- inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime);
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0);
+ inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0);
inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0);
/* calc blocks based on 512 byte blocksize */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 31e897ad5e6a..58b5de081b57 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -803,12 +803,6 @@ dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
}
-static struct shrinker dqcache_shrinker = {
- .count_objects = dqcache_shrink_count,
- .scan_objects = dqcache_shrink_scan,
- .seeks = DEFAULT_SEEKS,
-};
-
/*
* Safely release dquot and put reference to dquot.
*/
@@ -2351,6 +2345,20 @@ static int vfs_setup_quota_inode(struct inode *inode, int type)
if (sb_has_quota_loaded(sb, type))
return -EBUSY;
+ /*
+ * Quota files should never be encrypted. They should be thought of as
+ * filesystem metadata, not user data. New-style internal quota files
+ * cannot be encrypted by users anyway, but old-style external quota
+ * files could potentially be incorrectly created in an encrypted
+ * directory, hence this explicit check. Some reasons why encrypted
+ * quota files don't work include: (1) some filesystems that support
+ * encryption don't handle it in their quota_read and quota_write, and
+ * (2) cleaning up encrypted quota files at unmount would need special
+ * consideration, as quota files are cleaned up later than user files.
+ */
+ if (IS_ENCRYPTED(inode))
+ return -EINVAL;
+
dqopt->files[type] = igrab(inode);
if (!dqopt->files[type])
return -EIO;
@@ -2968,6 +2976,7 @@ static int __init dquot_init(void)
{
int i, ret;
unsigned long nr_hash, order;
+ struct shrinker *dqcache_shrinker;
printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -3002,8 +3011,14 @@ static int __init dquot_init(void)
pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
- if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
- panic("Cannot register dquot shrinker");
+ dqcache_shrinker = shrinker_alloc(0, "dquota-cache");
+ if (!dqcache_shrinker)
+ panic("Cannot allocate dquot shrinker");
+
+ dqcache_shrinker->count_objects = dqcache_shrink_count;
+ dqcache_shrinker->scan_objects = dqcache_shrink_scan;
+
+ shrinker_register(dqcache_shrinker);
return 0;
}
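dquot_init() above is converted to the allocated-shrinker API. A brief sketch of the full lifecycle under that API, including the shrinker_free() teardown this hunk does not need to show; my_count/my_scan are placeholder callbacks:

static struct shrinker *my_shrinker;

static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
{
	return 0;		/* placeholder: report reclaimable object count */
}

static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
{
	return SHRINK_STOP;	/* placeholder: free objects, return how many */
}

static int __init my_init(void)
{
	my_shrinker = shrinker_alloc(0, "my-cache");
	if (!my_shrinker)
		return -ENOMEM;

	my_shrinker->count_objects = my_count;
	my_shrinker->scan_objects = my_scan;
	/* the hunk above relies on shrinker_alloc() defaulting seeks to DEFAULT_SEEKS */

	shrinker_register(my_shrinker);
	return 0;
}

static void __exit my_exit(void)
{
	shrinker_free(my_shrinker);	/* unregisters and frees */
}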
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 18e8387cab41..4ac05a9e25bc 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode->i_mapping->a_ops = &ram_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
@@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
return error;
}
@@ -138,7 +138,8 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
if (!error) {
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
} else
iput(inode);
}
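The timestamp conversions in the ramfs hunks above (and in the pstore, qnx4/qnx6, reiserfs and romfs hunks nearby) all switch from direct i_mtime/i_atime stores to accessor helpers. A short illustrative sketch of the forms this series uses, assuming the helpers as declared in <linux/fs.h>; "secs" is a placeholder on-disk value:

static void timestamp_forms(struct inode *inode, struct inode *dir, time64_t secs)
{
	/* New in-memory inode: atime = mtime = ctime = now, in one call. */
	simple_inode_init_ts(inode);

	/* Directory change: bump ctime and mirror it into mtime. */
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));

	/* On-disk timestamps with second granularity (qnx4/reiserfs style). */
	inode_set_mtime(inode, secs, 0);
	inode_set_atime(inode, secs, 0);

	/* Reading back just the seconds when packing stat data. */
	pr_debug("mtime=%lld\n", (long long)inode_get_mtime_sec(inode));
}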
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 86e55d4bb10d..1d825459ee6e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1257,11 +1257,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
i_uid_write(inode, sd_v1_uid(sd));
i_gid_write(inode, sd_v1_gid(sd));
inode->i_size = sd_v1_size(sd);
- inode->i_atime.tv_sec = sd_v1_atime(sd);
- inode->i_mtime.tv_sec = sd_v1_mtime(sd);
+ inode_set_atime(inode, sd_v1_atime(sd), 0);
+ inode_set_mtime(inode, sd_v1_mtime(sd), 0);
inode_set_ctime(inode, sd_v1_ctime(sd), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
inode->i_blocks = sd_v1_blocks(sd);
inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
@@ -1311,11 +1309,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
i_uid_write(inode, sd_v2_uid(sd));
inode->i_size = sd_v2_size(sd);
i_gid_write(inode, sd_v2_gid(sd));
- inode->i_mtime.tv_sec = sd_v2_mtime(sd);
- inode->i_atime.tv_sec = sd_v2_atime(sd);
+ inode_set_mtime(inode, sd_v2_mtime(sd), 0);
+ inode_set_atime(inode, sd_v2_atime(sd), 0);
inode_set_ctime(inode, sd_v2_ctime(sd), 0);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
inode->i_blocks = sd_v2_blocks(sd);
rdev = sd_v2_rdev(sd);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -1370,9 +1366,9 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_uid(sd_v2, i_uid_read(inode));
set_sd_v2_size(sd_v2, size);
set_sd_v2_gid(sd_v2, i_gid_read(inode));
- set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
- set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
- set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec);
+ set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
+ set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
+ set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
@@ -1391,9 +1387,9 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
set_sd_v1_gid(sd_v1, i_gid_read(inode));
set_sd_v1_nlink(sd_v1, inode->i_nlink);
set_sd_v1_size(sd_v1, size);
- set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
- set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec);
- set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
+ set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
+ set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
+ set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
@@ -1984,7 +1980,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
/* uid and gid must already be set by the caller for quota init */
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_size = i_size;
inode->i_blocks = 0;
inode->i_bytes = 0;
@@ -2507,10 +2503,10 @@ out:
* start/recovery path as __block_write_full_folio, along with special
* code to handle reiserfs tails.
*/
-static int reiserfs_write_full_page(struct page *page,
+static int reiserfs_write_full_folio(struct folio *folio,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned long end_index = inode->i_size >> PAGE_SHIFT;
int error = 0;
unsigned long block;
@@ -2518,7 +2514,7 @@ static int reiserfs_write_full_page(struct page *page,
struct buffer_head *head, *bh;
int partial = 0;
int nr = 0;
- int checked = PageChecked(page);
+ int checked = folio_test_checked(folio);
struct reiserfs_transaction_handle th;
struct super_block *s = inode->i_sb;
int bh_per_page = PAGE_SIZE / s->s_blocksize;
@@ -2526,47 +2522,46 @@ static int reiserfs_write_full_page(struct page *page,
/* no logging allowed when nonblocking or from PF_MEMALLOC */
if (checked && (current->flags & PF_MEMALLOC)) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
return 0;
}
/*
- * The page dirty bit is cleared before writepage is called, which
+ * The folio dirty bit is cleared before writepage is called, which
* means we have to tell create_empty_buffers to make dirty buffers
- * The page really should be up to date at this point, so tossing
+ * The folio really should be up to date at this point, so tossing
* in the BH_Uptodate is just a sanity check.
*/
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, s->s_blocksize,
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, s->s_blocksize,
(1 << BH_Dirty) | (1 << BH_Uptodate));
- }
- head = page_buffers(page);
/*
- * last page in the file, zero out any contents past the
+ * last folio in the file, zero out any contents past the
* last byte in the file
*/
- if (page->index >= end_index) {
+ if (folio->index >= end_index) {
unsigned last_offset;
last_offset = inode->i_size & (PAGE_SIZE - 1);
- /* no file contents in this page */
- if (page->index >= end_index + 1 || !last_offset) {
- unlock_page(page);
+ /* no file contents in this folio */
+ if (folio->index >= end_index + 1 || !last_offset) {
+ folio_unlock(folio);
return 0;
}
- zero_user_segment(page, last_offset, PAGE_SIZE);
+ folio_zero_segment(folio, last_offset, folio_size(folio));
}
bh = head;
- block = page->index << (PAGE_SHIFT - s->s_blocksize_bits);
+ block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits);
last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
/* first map all the buffers, logging any direct items we find */
do {
if (block > last_block) {
/*
* This can happen when the block size is less than
- * the page size. The corresponding bytes in the page
+ * the folio size. The corresponding bytes in the folio
* were zero filled above
*/
clear_buffer_dirty(bh);
@@ -2593,7 +2588,7 @@ static int reiserfs_write_full_page(struct page *page,
* blocks we're going to log
*/
if (checked) {
- ClearPageChecked(page);
+ folio_clear_checked(folio);
reiserfs_write_lock(s);
error = journal_begin(&th, s, bh_per_page + 1);
if (error) {
@@ -2602,7 +2597,7 @@ static int reiserfs_write_full_page(struct page *page,
}
reiserfs_update_inode_transaction(inode);
}
- /* now go through and lock any dirty buffers on the page */
+ /* now go through and lock any dirty buffers on the folio */
do {
get_bh(bh);
if (!buffer_mapped(bh))
@@ -2623,7 +2618,7 @@ static int reiserfs_write_full_page(struct page *page,
lock_buffer(bh);
} else {
if (!trylock_buffer(bh)) {
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
continue;
}
}
@@ -2640,13 +2635,13 @@ static int reiserfs_write_full_page(struct page *page,
if (error)
goto fail;
}
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
/*
- * since any buffer might be the only dirty buffer on the page,
- * the first submit_bh can bring the page out of writeback.
+ * since any buffer might be the only dirty buffer on the folio,
+ * the first submit_bh can bring the folio out of writeback.
* be careful with the buffers.
*/
do {
@@ -2663,10 +2658,10 @@ static int reiserfs_write_full_page(struct page *page,
done:
if (nr == 0) {
/*
- * if this page only had a direct item, it is very possible for
+ * if this folio only had a direct item, it is very possible for
* no io to be required without there being an error. Or,
* someone else could have locked them and sent them down the
- * pipe without locking the page
+ * pipe without locking the folio
*/
bh = head;
do {
@@ -2677,18 +2672,18 @@ done:
bh = bh->b_this_page;
} while (bh != head);
if (!partial)
- SetPageUptodate(page);
- end_page_writeback(page);
+ folio_mark_uptodate(folio);
+ folio_end_writeback(folio);
}
return error;
fail:
/*
* catches various errors, we need to make sure any valid dirty blocks
- * get to the media. The page is currently locked and not marked for
+ * get to the media. The folio is currently locked and not marked for
* writeback
*/
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
bh = head;
do {
get_bh(bh);
@@ -2698,16 +2693,16 @@ fail:
} else {
/*
* clear any dirty bits that might have come from
- * getting attached to a dirty page
+ * getting attached to a dirty folio
*/
clear_buffer_dirty(bh);
}
bh = bh->b_this_page;
} while (bh != head);
- SetPageError(page);
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ folio_set_error(folio);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
@@ -2728,9 +2723,10 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio)
static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
reiserfs_wait_on_write_block(inode->i_sb);
- return reiserfs_write_full_page(page, wbc);
+ return reiserfs_write_full_folio(folio, wbc);
}
static void reiserfs_truncate_failed_write(struct inode *inode)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 015bfe4e4524..171c912af50f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -90,8 +90,7 @@ static int flush_commit_list(struct super_block *s,
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
struct super_block *sb);
-static void release_journal_dev(struct super_block *super,
- struct reiserfs_journal *journal);
+static void release_journal_dev(struct reiserfs_journal *journal);
static void dirty_one_transaction(struct super_block *s,
struct reiserfs_journal_list *jl);
static void flush_async_commits(struct work_struct *work);
@@ -1893,7 +1892,7 @@ static void free_journal_ram(struct super_block *sb)
* j_header_bh is on the journal dev, make sure
* not to release the journal dev until we brelse j_header_bh
*/
- release_journal_dev(sb, journal);
+ release_journal_dev(journal);
vfree(journal);
}
@@ -2387,7 +2386,7 @@ static int journal_read(struct super_block *sb)
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
reiserfs_info(sb, "checking transaction log (%pg)\n",
- journal->j_dev_bd);
+ journal->j_bdev_handle->bdev);
start = ktime_get_seconds();
/*
@@ -2448,7 +2447,7 @@ static int journal_read(struct super_block *sb)
* device and journal device to be the same
*/
d_bh =
- reiserfs_breada(journal->j_dev_bd, cur_dblock,
+ reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock,
sb->s_blocksize,
SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb));
@@ -2587,17 +2586,11 @@ static void journal_list_init(struct super_block *sb)
SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
}
-static void release_journal_dev(struct super_block *super,
- struct reiserfs_journal *journal)
+static void release_journal_dev(struct reiserfs_journal *journal)
{
- if (journal->j_dev_bd != NULL) {
- void *holder = NULL;
-
- if (journal->j_dev_bd->bd_dev != super->s_dev)
- holder = journal;
-
- blkdev_put(journal->j_dev_bd, holder);
- journal->j_dev_bd = NULL;
+ if (journal->j_bdev_handle) {
+ bdev_release(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
}
}
@@ -2612,7 +2605,7 @@ static int journal_init_dev(struct super_block *super,
result = 0;
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = NULL;
jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
@@ -2623,36 +2616,37 @@ static int journal_init_dev(struct super_block *super,
if ((!jdev_name || !jdev_name[0])) {
if (jdev == super->s_dev)
holder = NULL;
- journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder,
- NULL);
- if (IS_ERR(journal->j_dev_bd)) {
- result = PTR_ERR(journal->j_dev_bd);
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode,
+ holder, NULL);
+ if (IS_ERR(journal->j_bdev_handle)) {
+ result = PTR_ERR(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
reiserfs_warning(super, "sh-458",
"cannot init journal device unknown-block(%u,%u): %i",
MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
- set_blocksize(journal->j_dev_bd, super->s_blocksize);
+ set_blocksize(journal->j_bdev_handle->bdev,
+ super->s_blocksize);
return 0;
}
- journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder,
- NULL);
- if (IS_ERR(journal->j_dev_bd)) {
- result = PTR_ERR(journal->j_dev_bd);
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode,
+ holder, NULL);
+ if (IS_ERR(journal->j_bdev_handle)) {
+ result = PTR_ERR(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
reiserfs_warning(super, "sh-457",
"journal_init_dev: Cannot open '%s': %i",
jdev_name, result);
return result;
}
- set_blocksize(journal->j_dev_bd, super->s_blocksize);
+ set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize);
reiserfs_info(super,
"journal_init_dev: journal device: %pg\n",
- journal->j_dev_bd);
+ journal->j_bdev_handle->bdev);
return 0;
}
@@ -2810,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
"journal header magic %x (device %pg) does "
"not match to magic found in super block %x",
jh->jh_journal.jp_journal_magic,
- journal->j_dev_bd,
+ journal->j_bdev_handle->bdev,
sb_jp_journal_magic(rs));
brelse(bhjh);
goto free_and_return;
@@ -2834,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
reiserfs_info(sb, "journal params: device %pg, size %u, "
"journal first block %u, max trans len %u, max batch %u, "
"max commit age %u, max trans age %u\n",
- journal->j_dev_bd,
+ journal->j_bdev_handle->bdev,
SB_ONDISK_JOURNAL_SIZE(sb),
SB_ONDISK_JOURNAL_1st_BLOCK(sb),
journal->j_trans_max,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 9c5704be2435..994d6e6995ab 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
}
dir->i_size += paste_size;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (!S_ISDIR(inode->i_mode) && visible)
/* reiserfs_mkdir or reiserfs_rename will do that by itself */
reiserfs_update_sd(th, dir);
@@ -966,8 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_nlink);
clear_nlink(inode);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
reiserfs_update_sd(&th, inode);
DEC_DIR_INODE_NLINK(dir)
@@ -1075,7 +1075,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
reiserfs_update_sd(&th, inode);
dir->i_size -= (de.de_entrylen + DEH_SIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
reiserfs_update_sd(&th, dir);
if (!savelink)
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 3dba8acf4e83..83cb9402e0f9 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused)
"prepare: \t%12lu\n"
"prepare_retry: \t%12lu\n",
DJP(jp_journal_1st_block),
- SB_JOURNAL(sb)->j_dev_bd,
+ SB_JOURNAL(sb)->j_bdev_handle->bdev,
DJP(jp_journal_dev),
DJP(jp_journal_size),
DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 7d12b8c5b2fa..725667880e62 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -299,7 +299,7 @@ struct reiserfs_journal {
/* oldest journal block. start here for traverse */
struct reiserfs_journal_cnode *j_first;
- struct block_device *j_dev_bd;
+ struct bdev_handle *j_bdev_handle;
/* first block on s_dev of reserved area journal */
int j_1st_reserved_block;
@@ -1165,7 +1165,7 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
-extern const struct xattr_handler *reiserfs_xattr_handlers[];
+extern const struct xattr_handler * const reiserfs_xattr_handlers[];
/*
* this says about version of key of all items (but stat data) the
@@ -2809,9 +2809,12 @@ struct reiserfs_journal_header {
#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
/* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
-#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_find_get_block(s, block) __find_get_block(\
+ SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+ block, s->s_blocksize)
+#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+ block, s->s_blocksize)
enum reiserfs_bh_state_bits {
BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 3676e02a0232..2138ee7d271d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2003,7 +2003,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
pathrelse(&s_search_path);
if (update_timestamps) {
- inode->i_mtime = current_time(inode);
+ inode_set_mtime_to_ts(inode,
+ current_time(inode));
inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
@@ -2028,7 +2029,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
update_and_out:
if (update_timestamps) {
/* this is truncate, not file closing */
- inode->i_mtime = current_time(inode);
+ inode_set_mtime_to_ts(inode, current_time(inode));
inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7eaf36b3de12..67b5510beded 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2587,7 +2587,7 @@ out:
return err;
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6000964c2b80..998035a6388e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -780,7 +780,7 @@ static inline bool reiserfs_posix_acl_list(const char *name,
}
/* This is the implementation for the xattr plugin infrastructure */
-static inline bool reiserfs_xattr_list(const struct xattr_handler **handlers,
+static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
const char *name, struct dentry *dentry)
{
if (handlers) {
@@ -911,7 +911,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-const struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler * const reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 5c35f6c76037..545ad44f96b8 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -322,7 +322,8 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
set_nlink(i, 1); /* Hard to decide.. */
i->i_size = be32_to_cpu(ri.size);
- i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0);
+ inode_set_mtime_to_ts(i,
+ inode_set_atime_to_ts(i, inode_set_ctime(i, 0, 0)));
/* set up mode and ops */
mode = romfs_modemap[nextfh & ROMFH_TYPE];
@@ -593,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb)
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- blkdev_put(sb->s_bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
#endif
}
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index fe1bf5b6e0cb..d64a306a414b 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -32,7 +32,7 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
* fully cached or it may be in the process of
* being deleted due to a lease break.
*/
- if (!cfid->has_lease) {
+ if (!cfid->time || !cfid->has_lease) {
spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
@@ -193,9 +193,19 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
npath = path_no_prefix(cifs_sb, path);
if (IS_ERR(npath)) {
rc = PTR_ERR(npath);
- kfree(utf16_path);
- return rc;
+ goto out;
+ }
+
+ if (!npath[0]) {
+ dentry = dget(cifs_sb->root);
+ } else {
+ dentry = path_to_dentry(cifs_sb, npath);
+ if (IS_ERR(dentry)) {
+ rc = -ENOENT;
+ goto out;
+ }
}
+ cfid->dentry = dentry;
/*
* We do not hold the lock for the open because in case
@@ -249,6 +259,15 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
smb2_set_related(&rqst[1]);
+ /*
+ * Set @cfid->has_lease to true before sending out compounded request so
+ * its lease reference can be put in cached_dir_lease_break() due to a
+ * potential lease break right after the request is sent or while @cfid
+ * is still being cached. Concurrent processes won't be able to use it yet
+ * due to @cfid->time being zero.
+ */
+ cfid->has_lease = true;
+
rc = compound_send_recv(xid, ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
@@ -263,6 +282,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
cfid->tcon = tcon;
cfid->is_open = true;
+ spin_lock(&cfids->cfid_list_lock);
+
o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
oparms.fid->persistent_fid = o_rsp->PersistentFileId;
oparms.fid->volatile_fid = o_rsp->VolatileFileId;
@@ -270,18 +291,32 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
- if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
+
+ if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) {
+ spin_unlock(&cfids->cfid_list_lock);
+ rc = -EINVAL;
+ goto oshr_free;
+ }
+
+ rc = smb2_parse_contexts(server, rsp_iov,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key,
+ &oplock, NULL, NULL);
+ if (rc) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
- smb2_parse_contexts(server, o_rsp,
- &oparms.fid->epoch,
- oparms.fid->lease_key, &oplock,
- NULL, NULL);
- if (!(oplock & SMB2_LEASE_READ_CACHING_HE))
+ rc = -EINVAL;
+ if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
if (!smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
sizeof(struct smb2_file_all_info),
@@ -289,37 +324,24 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
(char *)&cfid->file_all_info))
cfid->file_all_info_is_valid = true;
- if (!npath[0])
- dentry = dget(cifs_sb->root);
- else {
- dentry = path_to_dentry(cifs_sb, npath);
- if (IS_ERR(dentry)) {
- rc = -ENOENT;
- goto oshr_free;
- }
- }
- spin_lock(&cfids->cfid_list_lock);
- cfid->dentry = dentry;
cfid->time = jiffies;
- cfid->has_lease = true;
spin_unlock(&cfids->cfid_list_lock);
+ /* At this point the directory handle is fully cached */
+ rc = 0;
oshr_free:
- kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_query_info_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- spin_lock(&cfids->cfid_list_lock);
- if (!cfid->has_lease) {
- if (rc) {
- if (cfid->on_list) {
- list_del(&cfid->entry);
- cfid->on_list = false;
- cfids->num_entries--;
- }
- rc = -ENOENT;
- } else {
+ if (rc) {
+ spin_lock(&cfids->cfid_list_lock);
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+ }
+ if (cfid->has_lease) {
/*
* We are guaranteed to have two references at this
* point. One for the caller and one for a potential
@@ -327,25 +349,24 @@ oshr_free:
* will be closed when the caller closes the cached
* handle.
*/
+ cfid->has_lease = false;
spin_unlock(&cfids->cfid_list_lock);
kref_put(&cfid->refcount, smb2_close_cached_fid);
goto out;
}
+ spin_unlock(&cfids->cfid_list_lock);
}
- spin_unlock(&cfids->cfid_list_lock);
+out:
if (rc) {
if (cfid->is_open)
SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
cfid->fid.volatile_fid);
free_cached_dir(cfid);
- cfid = NULL;
- }
-out:
- if (rc == 0) {
+ } else {
*ret_cfid = cfid;
atomic_inc(&tcon->num_remote_opens);
}
-
+ kfree(utf16_path);
return rc;
}
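
Editor's note: the open_cached_dir() rework above publishes the cfid with has_lease set before the compound send, while lookups additionally require a non-zero cfid->time, set only once caching has fully succeeded. A simplified, generic sketch of that "publish early, gate readers on a ready stamp" idiom follows; struct example_obj and example_lookup() are invented for the sketch and are not the cifs code.

#include <linux/list.h>
#include <linux/kref.h>
#include <linux/spinlock.h>

struct example_obj {
	struct list_head entry;
	struct kref refcount;
	unsigned long ready_time;	/* 0 means "not usable yet" */
	bool has_lease;
};

static struct example_obj *example_lookup(struct list_head *head,
					  spinlock_t *lock)
{
	struct example_obj *obj;

	spin_lock(lock);
	list_for_each_entry(obj, head, entry) {
		/* Skip entries still being set up or being torn down. */
		if (!obj->ready_time || !obj->has_lease)
			continue;
		kref_get(&obj->refcount);
		spin_unlock(lock);
		return obj;
	}
	spin_unlock(lock);
	return NULL;
}
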
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 76922fcc4bc6..60027f5aebe8 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -40,11 +40,13 @@ void cifs_dump_detail(void *buf, struct TCP_Server_Info *server)
#ifdef CONFIG_CIFS_DEBUG2
struct smb_hdr *smb = buf;
- cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
- smb->Command, smb->Status.CifsError,
- smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
- cifs_dbg(VFS, "smb buf %p len %u\n", smb,
- server->ops->calc_smb_size(smb));
+ cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d Wct: %d\n",
+ smb->Command, smb->Status.CifsError, smb->Flags,
+ smb->Flags2, smb->Mid, smb->Pid, smb->WordCount);
+ if (!server->ops->check_message(buf, server->total_read, server)) {
+ cifs_dbg(VFS, "smb buf %p len %u\n", smb,
+ server->ops->calc_smb_size(smb));
+ }
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -136,6 +138,11 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
{
struct TCP_Server_Info *server = chan->server;
+ if (!server) {
+ seq_printf(m, "\n\n\t\tChannel: %d DISABLED", i+1);
+ return;
+ }
+
seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx"
"\n\t\tNumber of credits: %d,%d,%d Dialect 0x%x"
"\n\t\tTCP status: %d Instance: %d"
@@ -279,6 +286,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifs_server_iface *iface;
+ size_t iface_weight = 0, iface_min_speed = 0;
+ struct cifs_server_iface *last_iface = NULL;
int c, i, j;
seq_puts(m,
@@ -427,6 +436,8 @@ skip_rdma:
if (server->nosharesock)
seq_printf(m, " nosharesock");
+ seq_printf(m, "\nServer capabilities: 0x%x", server->capabilities);
+
if (server->rdma)
seq_printf(m, "\nRDMA ");
seq_printf(m, "\nTCP status: %d Instance: %d"
@@ -452,6 +463,11 @@ skip_rdma:
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
+ continue;
+ }
i++;
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
@@ -472,6 +488,7 @@ skip_rdma:
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->ses_status);
}
+ spin_unlock(&ses->ses_lock);
seq_printf(m, "\n\tSecurity type: %s ",
get_security_type_str(server->ops->select_sectype(server, ses->sectype)));
@@ -536,11 +553,25 @@ skip_rdma:
"\tLast updated: %lu seconds ago",
ses->iface_count,
(jiffies - ses->iface_last_update) / HZ);
+
+ last_iface = list_last_entry(&ses->iface_list,
+ struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
j = 0;
list_for_each_entry(iface, &ses->iface_list,
iface_head) {
seq_printf(m, "\n\t%d)", ++j);
cifs_dump_iface(m, iface);
+
+ iface_weight = iface->speed / iface_min_speed;
+ seq_printf(m, "\t\tWeight (cur,total): (%zu,%zu)"
+ "\n\t\tAllocated channels: %u\n",
+ iface->weight_fulfilled,
+ iface_weight,
+ iface->num_channels);
+
if (is_ses_using_iface(ses, iface))
seq_puts(m, "\t\t[CONNECTED]\n");
}
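
Editor's note: the debug output added above prints a per-interface weight computed as iface->speed / iface_min_speed, where the list's last entry is treated as the slowest interface. A short worked example with made-up link speeds:

/*
 * Illustrative values only:
 *
 *	iface speeds: 10 Gbps, 10 Gbps, 1 Gbps  ->  iface_min_speed = 1 Gbps
 *	weights:      10/1 = 10, 10/1 = 10, 1/1 = 1
 *
 * weight_fulfilled then counts how many of those weight "slots" already
 * have a channel bound to the interface, and num_channels is the total
 * channel count on it.
 */
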
@@ -738,14 +769,14 @@ static ssize_t name##_write(struct file *file, const char __user *buffer, \
size_t count, loff_t *ppos) \
{ \
int rc; \
- rc = kstrtoint_from_user(buffer, count, 10, & name); \
+ rc = kstrtoint_from_user(buffer, count, 10, &name); \
if (rc) \
return rc; \
return count; \
} \
static int name##_proc_show(struct seq_file *m, void *v) \
{ \
- seq_printf(m, "%d\n", name ); \
+ seq_printf(m, "%d\n", name); \
return 0; \
} \
static int name##_open(struct inode *inode, struct file *file) \
diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h
index 332588e77c31..26327442e383 100644
--- a/fs/smb/client/cifs_ioctl.h
+++ b/fs/smb/client/cifs_ioctl.h
@@ -26,6 +26,11 @@ struct smb_mnt_fs_info {
__u64 cifs_posix_caps;
} __packed;
+struct smb_mnt_tcon_info {
+ __u32 tid;
+ __u64 session_id;
+} __packed;
+
struct smb_snapshot_array {
__u32 number_of_snapshots;
__u32 number_of_snapshots_returned;
@@ -108,6 +113,7 @@ struct smb3_notify_info {
#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info)
+#define CIFS_IOC_GET_TCON_INFO _IOR(CIFS_IOCTL_MAGIC, 12, struct smb_mnt_tcon_info)
#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32)
/*
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 6f3285f1dfee..af7849e5974f 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = {
* strlen(";sec=ntlmsspi") */
#define MAX_MECH_STR_LEN 13
-/* strlen of "host=" */
-#define HOST_KEY_LEN 5
+/* strlen of ";host=" */
+#define HOST_KEY_LEN 6
/* strlen of ";ip4=" or ";ip6=" */
#define IP_KEY_LEN 5
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 22869cda1356..2131638f26d0 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1191,36 +1191,108 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
const struct inode_operations cifs_symlink_inode_ops = {
.get_link = cifs_get_link,
+ .setattr = cifs_setattr,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
};
+/*
+ * Advance the EOF marker to after the source range.
+ */
+static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *src_cifsi,
+ struct cifs_tcon *src_tcon,
+ unsigned int xid, loff_t src_end)
+{
+ struct cifsFileInfo *writeable_srcfile;
+ int rc = -EINVAL;
+
+ writeable_srcfile = find_writable_file(src_cifsi, FIND_WR_FSUID_ONLY);
+ if (writeable_srcfile) {
+ if (src_tcon->ses->server->ops->set_file_size)
+ rc = src_tcon->ses->server->ops->set_file_size(
+ xid, src_tcon, writeable_srcfile,
+ src_inode->i_size, true /* no need to set sparse */);
+ else
+ rc = -ENOSYS;
+ cifsFileInfo_put(writeable_srcfile);
+ cifs_dbg(FYI, "SetFSize for copychunk rc = %d\n", rc);
+ }
+
+ if (rc < 0)
+ goto set_failed;
+
+ netfs_resize_file(&src_cifsi->netfs, src_end);
+ fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end);
+ return 0;
+
+set_failed:
+ return filemap_write_and_wait(src_inode->i_mapping);
+}
+
+/*
+ * Flush out either the folio that overlaps the beginning of a range in which
+ * pos resides or the folio that overlaps the end of a range unless that folio
+ * is entirely within the range we're going to invalidate. We extend the flush
+ * bounds to encompass the folio.
+ */
+static int cifs_flush_folio(struct inode *inode, loff_t pos, loff_t *_fstart, loff_t *_fend,
+ bool first)
+{
+ struct folio *folio;
+ unsigned long long fpos, fend;
+ pgoff_t index = pos / PAGE_SIZE;
+ size_t size;
+ int rc = 0;
+
+ folio = filemap_get_folio(inode->i_mapping, index);
+ if (IS_ERR(folio))
+ return 0;
+
+ size = folio_size(folio);
+ fpos = folio_pos(folio);
+ fend = fpos + size - 1;
+ *_fstart = min_t(unsigned long long, *_fstart, fpos);
+ *_fend = max_t(unsigned long long, *_fend, fend);
+ if ((first && pos == fpos) || (!first && pos == fend))
+ goto out;
+
+ rc = filemap_write_and_wait_range(inode->i_mapping, fpos, fend);
+out:
+ folio_put(folio);
+ return rc;
+}
+
static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
+ struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
+ struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode);
struct cifsFileInfo *smb_file_src = src_file->private_data;
- struct cifsFileInfo *smb_file_target;
- struct cifs_tcon *target_tcon;
+ struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct cifs_tcon *target_tcon, *src_tcon;
+ unsigned long long destend, fstart, fend, new_size;
unsigned int xid;
int rc;
- if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
cifs_dbg(FYI, "clone range\n");
xid = get_xid();
- if (!src_file->private_data || !dst_file->private_data) {
+ if (!smb_file_src || !smb_file_target) {
rc = -EBADF;
cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
goto out;
}
- smb_file_target = dst_file->private_data;
+ src_tcon = tlink_tcon(smb_file_src->tlink);
target_tcon = tlink_tcon(smb_file_target->tlink);
/*
@@ -1233,20 +1305,63 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
if (len == 0)
len = src_inode->i_size - off;
- cifs_dbg(FYI, "about to flush pages\n");
- /* should we flush first and last page first */
- truncate_inode_pages_range(&target_inode->i_data, destoff,
- PAGE_ALIGN(destoff + len)-1);
+ cifs_dbg(FYI, "clone range\n");
+
+ /* Flush the source buffer */
+ rc = filemap_write_and_wait_range(src_inode->i_mapping, off,
+ off + len - 1);
+ if (rc)
+ goto unlock;
+
+ /* The server-side copy will fail if the source crosses the EOF marker.
+ * Advance the EOF marker after the flush above to the end of the range
+ * if it's short of that.
+ */
+ if (src_cifsi->netfs.remote_i_size < off + len) {
+ rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
+ if (rc < 0)
+ goto unlock;
+ }
+
+ new_size = destoff + len;
+ destend = destoff + len - 1;
+
+ /* Flush the folios at either end of the destination range to prevent
+ * accidental loss of dirty data outside of the range.
+ */
+ fstart = destoff;
+ fend = destend;
- if (target_tcon->ses->server->ops->duplicate_extents)
+ rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true);
+ if (rc)
+ goto unlock;
+ rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
+ if (rc)
+ goto unlock;
+
+ /* Discard all the folios that overlap the destination region. */
+ cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend);
+ truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
+
+ fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
+ i_size_read(target_inode), 0);
+
+ rc = -EOPNOTSUPP;
+ if (target_tcon->ses->server->ops->duplicate_extents) {
rc = target_tcon->ses->server->ops->duplicate_extents(xid,
smb_file_src, smb_file_target, off, len, destoff);
- else
- rc = -EOPNOTSUPP;
+ if (rc == 0 && new_size > i_size_read(target_inode)) {
+ truncate_setsize(target_inode, new_size);
+ netfs_resize_file(&target_cifsi->netfs, new_size);
+ fscache_resize_cookie(cifs_inode_cookie(target_inode),
+ new_size);
+ }
+ }
/* force revalidate of size and timestamps of target file now
that target is updated on the server */
CIFS_I(target_inode)->time = 0;
+unlock:
/* although unlocking in the reverse order from locking is not
strictly necessary here it is a little cleaner to be consistent */
unlock_two_nondirectories(src_inode, target_inode);
@@ -1262,10 +1377,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
{
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
+ struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
struct cifsFileInfo *smb_file_src;
struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
struct cifs_tcon *target_tcon;
+ unsigned long long destend, fstart, fend;
ssize_t rc;
cifs_dbg(FYI, "copychunk range\n");
@@ -1305,13 +1422,41 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
if (rc)
goto unlock;
- /* should we flush first and last page first */
- truncate_inode_pages(&target_inode->i_data, 0);
+ /* The server-side copy will fail if the source crosses the EOF marker.
+ * Advance the EOF marker after the flush above to the end of the range
+ * if it's short of that.
+ */
+ if (src_cifsi->server_eof < off + len) {
+ rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
+ if (rc < 0)
+ goto unlock;
+ }
+
+ destend = destoff + len - 1;
+
+ /* Flush the folios at either end of the destination range to prevent
+ * accidental loss of dirty data outside of the range.
+ */
+ fstart = destoff;
+ fend = destend;
+
+ rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true);
+ if (rc)
+ goto unlock;
+ rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
+ if (rc)
+ goto unlock;
+
+ /* Discard all the folios that overlap the destination region. */
+ truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
rc = file_modified(dst_file);
- if (!rc)
+ if (!rc) {
rc = target_tcon->ses->server->ops->copychunk_range(xid,
smb_file_src, smb_file_target, off, len, destoff);
+ if (rc > 0 && destoff + rc > i_size_read(target_inode))
+ truncate_setsize(target_inode, destoff + rc);
+ }
file_accessed(src_file);
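
Editor's note: both copy paths above flush only the folios that straddle the ends of the destination range and then invalidate the folio-aligned window, instead of truncating the whole mapping. A worked example of the bound extension, assuming 4K folios and made-up offsets:

/*
 * Illustrative numbers only:
 *
 *	destoff = 0x1800, len = 0x2000   ->  destend = 0x37ff
 *	folio containing destoff: [0x1000, 0x1fff]  ->  fstart = 0x1000
 *	folio containing destend: [0x3000, 0x3fff]  ->  fend   = 0x3fff
 *
 * The two boundary folios are written back first because they may hold
 * dirty data from outside the copy range; truncate_inode_pages_range()
 * then drops pages for [fstart, fend], i.e. whole folios only.
 */
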
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 41daebd220ff..3adea10aa9da 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -127,7 +127,7 @@ extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
struct dentry *direntry, const char *symname);
#ifdef CONFIG_CIFS_XATTR
-extern const struct xattr_handler *cifs_xattr_handlers[];
+extern const struct xattr_handler * const cifs_xattr_handlers[];
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
#else
# define cifs_xattr_handlers NULL
@@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 45
-#define CIFS_VERSION "2.45"
+#define SMB3_PRODUCT_BUILD 46
+#define CIFS_VERSION "2.46"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 02082621d8e0..5e32c79f03a7 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -191,7 +191,13 @@ struct cifs_open_info_data {
bool reparse_point;
bool symlink;
};
- __u32 reparse_tag;
+ struct {
+ __u32 tag;
+ union {
+ struct reparse_data_buffer *buf;
+ struct reparse_posix_data *posix;
+ };
+ } reparse;
char *symlink_target;
union {
struct smb2_file_all_info fi;
@@ -395,8 +401,7 @@ struct smb_version_operations {
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
- char **target_path,
- struct kvec *rsp_iov);
+ char **target_path);
/* open a file for non-posix mounts */
int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
void *buf);
@@ -527,7 +532,8 @@ struct smb_version_operations {
struct mid_q_entry **, char **, int *);
enum securityEnum (*select_sectype)(struct TCP_Server_Info *,
enum securityEnum);
- int (*next_header)(char *);
+ int (*next_header)(struct TCP_Server_Info *server, char *buf,
+ unsigned int *noff);
/* ioctl passthrough for query_info */
int (*ioctl_query_info)(const unsigned int xid,
struct cifs_tcon *tcon,
@@ -551,6 +557,9 @@ struct smb_version_operations {
bool (*is_status_io_timeout)(char *buf);
/* Check for STATUS_NETWORK_NAME_DELETED */
bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
+ int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb,
+ struct kvec *rsp_iov,
+ struct cifs_open_info_data *data);
};
struct smb_version_values {
@@ -650,6 +659,7 @@ struct TCP_Server_Info {
bool noautotune; /* do not autotune send buf sizes */
bool nosharesock;
bool tcp_nodelay;
+ bool terminate;
unsigned int credits; /* send no more requests at once */
unsigned int max_credits; /* can override large 32000 default at mnt */
unsigned int in_flight; /* number of requests on the wire to server */
@@ -969,6 +979,8 @@ struct cifs_server_iface {
struct list_head iface_head;
struct kref refcount;
size_t speed;
+ size_t weight_fulfilled;
+ unsigned int num_channels;
unsigned int rdma_capable : 1;
unsigned int rss_capable : 1;
unsigned int is_active : 1; /* unset if non existent */
@@ -982,7 +994,6 @@ release_iface(struct kref *ref)
struct cifs_server_iface *iface = container_of(ref,
struct cifs_server_iface,
refcount);
- list_del_init(&iface->iface_head);
kfree(iface);
}
@@ -1050,6 +1061,7 @@ struct cifs_ses {
spinlock_t chan_lock;
/* ========= begin: protected by chan_lock ======== */
#define CIFS_MAX_CHANNELS 16
+#define CIFS_INVAL_CHAN_INDEX (-1)
#define CIFS_ALL_CHANNELS_SET(ses) \
((1UL << (ses)->chan_count) - 1)
#define CIFS_ALL_CHANS_GOOD(ses) \
@@ -2143,6 +2155,7 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
unsigned int len, skip;
unsigned int nents = 0;
unsigned long addr;
+ size_t data_size;
int i, j;
/*
@@ -2158,17 +2171,21 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
* rqst[1+].rq_iov[0+] data to be encrypted/decrypted
*/
for (i = 0; i < num_rqst; i++) {
+ data_size = iov_iter_count(&rqst[i].rq_iter);
+
/* We really don't want a mixture of pinned and unpinned pages
* in the sglist. It's hard to keep track of which is what.
* Instead, we convert to a BVEC-type iterator higher up.
*/
- if (WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter)))
+ if (data_size &&
+ WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter)))
return -EIO;
/* We also don't want to have any extra refs or pins to clean
* up in the sglist.
*/
- if (WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter)))
+ if (data_size &&
+ WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter)))
return -EIO;
for (j = 0; j < rqst[i].rq_nvec; j++) {
@@ -2184,7 +2201,8 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
}
skip = 0;
}
- nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX);
+ if (data_size)
+ nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX);
}
nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE);
return nents;
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index e17222fec9d2..c0513fbb8a59 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -882,11 +882,13 @@ typedef struct smb_com_open_rsp {
__u8 OplockLevel;
__u16 Fid;
__le32 CreateAction;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le32 FileAttributes;
+ struct_group(common_attributes,
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 FileAttributes;
+ );
__le64 AllocationSize;
__le64 EndOfFile;
__le16 FileType;
@@ -1356,7 +1358,7 @@ typedef struct smb_com_transaction_ioctl_rsp {
__le32 DataDisplacement;
__u8 SetupCount; /* 1 */
__le16 ReturnedDataLen;
- __u16 ByteCount;
+ __le16 ByteCount;
} __attribute__((packed)) TRANSACT_IOCTL_RSP;
#define CIFS_ACL_OWNER 1
@@ -1509,7 +1511,7 @@ struct reparse_posix_data {
__le16 ReparseDataLength;
__u16 Reserved;
__le64 InodeType; /* LNK, FIFO, CHR etc. */
- char PathBuffer[];
+ __u8 DataBuffer[];
} __attribute__((packed));
struct cifs_quota_data {
@@ -2264,11 +2266,13 @@ typedef struct {
/* QueryFileInfo/QueryPathinfo (also for SetPath/SetFile) data buffer formats */
/******************************************************************************/
typedef struct { /* data block encoding of response to level 263 QPathInfo */
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le32 Attributes;
+ struct_group(common_attributes,
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 Attributes;
+ );
__u32 Pad1;
__le64 AllocationSize;
__le64 EndOfFile; /* size ie offset to first free byte in file */
@@ -2570,7 +2574,7 @@ typedef struct {
struct win_dev {
- unsigned char type[8]; /* IntxCHR or IntxBLK */
+ unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO */
__le64 major;
__le64 minor;
} __attribute__((packed));
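
Editor's note: the struct_group() conversions above wrap a run of on-the-wire members in a named span so callers (see the memcpy change in cifssmb.c further down) can copy "CreationTime through FileAttributes" via sizeof() instead of a hand-counted 36. A minimal sketch using hypothetical example_* names; struct_group() comes from <linux/stddef.h>.

#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

struct example_rsp {
	__u8 oplock;
	struct_group(common_attributes,
		__le64 creation_time;
		__le64 last_access_time;
		__le64 last_write_time;
		__le64 change_time;
		__le32 attributes;
	);
	__le64 end_of_file;
} __attribute__((packed));

static void example_copy_attrs(struct example_rsp *dst,
			       const struct example_rsp *src)
{
	/*
	 * One bounded copy of the whole span; FORTIFY_SOURCE can check
	 * sizeof(dst->common_attributes) instead of a magic constant.
	 */
	memcpy(&dst->common_attributes, &src->common_attributes,
	       sizeof(dst->common_attributes));
}
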
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 0c37eefa18a5..46feaa0880bd 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -81,7 +81,7 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx,
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
char *cifs_build_devname(char *nodename, const char *prepath);
extern void delete_mid(struct mid_q_entry *mid);
-extern void release_mid(struct mid_q_entry *mid);
+void __release_mid(struct kref *refcount);
extern void cifs_wake_up_task(struct mid_q_entry *mid);
extern int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
@@ -132,6 +132,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *in_buf,
struct smb_hdr *out_buf,
int *bytes_returned);
+
void
cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
bool all_channels);
@@ -209,7 +210,7 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
const struct cifs_fid *fid);
bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
- u32 tag);
+ struct cifs_open_info_data *data);
extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path,
struct super_block *sb, unsigned int xid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
@@ -457,6 +458,12 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
struct cifs_tcon *tcon,
const unsigned char *searchName, char **syminfo,
const struct nls_table *nls_codepage, int remap);
+extern int cifs_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype);
extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, char **symlinkinfo,
const struct nls_table *nls_codepage);
@@ -610,13 +617,13 @@ void cifs_free_hash(struct shash_desc **sdesc);
struct cifs_chan *
cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server);
-int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses);
+int cifs_try_adding_channels(struct cifs_ses *ses);
bool is_server_using_iface(struct TCP_Server_Info *server,
struct cifs_server_iface *iface);
bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface);
void cifs_ses_mark_for_reconnect(struct cifs_ses *ses);
-unsigned int
+int
cifs_ses_get_chan_index(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
@@ -640,6 +647,8 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
bool
cifs_chan_is_iface_active(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+void
+cifs_disable_secondary_channels(struct cifs_ses *ses);
int
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
int
@@ -656,6 +665,12 @@ void cifs_put_tcp_super(struct super_block *sb);
int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix);
char *extract_hostname(const char *unc);
char *extract_sharename(const char *unc);
+int parse_reparse_point(struct reparse_data_buffer *buf,
+ u32 plen, struct cifs_sb_info *cifs_sb,
+ bool unicode, struct cifs_open_info_data *data);
+int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev);
#ifdef CONFIG_CIFS_DFS_UPCALL
static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
@@ -740,4 +755,9 @@ static inline bool dfs_src_pathname_equal(const char *s1, const char *s2)
return true;
}
+static inline void release_mid(struct mid_q_entry *mid)
+{
+ kref_put(&mid->refcount, __release_mid);
+}
+
#endif /* _CIFSPROTO_H */
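
Editor's note: release_mid() becomes a static inline over kref_put(&mid->refcount, __release_mid). A generic sketch of that kref pattern follows; struct example_mid and the example_* helpers are hypothetical stand-ins, not the cifs types.

#include <linux/kref.h>
#include <linux/slab.h>

struct example_mid {
	struct kref refcount;
	/* ... request state ... */
};

static void example_release(struct kref *refcount)
{
	struct example_mid *mid = container_of(refcount, struct example_mid,
					       refcount);
	kfree(mid);
}

static inline void example_put(struct example_mid *mid)
{
	/* Frees the object via example_release() when the count hits zero. */
	kref_put(&mid->refcount, example_release);
}
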
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 25503f1a4fd2..9ee348e6d106 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1244,8 +1244,10 @@ openRetry:
*oplock |= CIFS_CREATE_ACTION;
if (buf) {
- /* copy from CreationTime to Attributes */
- memcpy((char *)buf, (char *)&rsp->CreationTime, 36);
+ /* copy commonly used attributes */
+ memcpy(&buf->common_attributes,
+ &rsp->common_attributes,
+ sizeof(buf->common_attributes));
/* the file_info buf is endian converted by caller */
buf->AllocationSize = rsp->AllocationSize;
buf->EndOfFile = rsp->EndOfFile;
@@ -2690,136 +2692,97 @@ querySymLinkRetry:
return rc;
}
-/*
- * Recent Windows versions now create symlinks more frequently
- * and they use the "reparse point" mechanism below. We can of course
- * do symlinks nicely to Samba and other servers which support the
- * CIFS Unix Extensions and we can also do SFU symlinks and "client only"
- * "MF" symlinks optionally, but for recent Windows we really need to
- * reenable the code below and fix the cifs_symlink callers to handle this.
- * In the interim this code has been moved to its own config option so
- * it is not compiled in by default until callers fixed up and more tested.
- */
-int
-CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
- __u16 fid, char **symlinkinfo,
- const struct nls_table *nls_codepage)
+int cifs_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype)
{
- int rc = 0;
- int bytes_returned;
- struct smb_com_transaction_ioctl_req *pSMB;
- struct smb_com_transaction_ioctl_rsp *pSMBr;
- bool is_unicode;
- unsigned int sub_len;
- char *sub_start;
- struct reparse_symlink_data *reparse_buf;
- struct reparse_posix_data *posix_buf;
+ struct cifs_open_parms oparms;
+ TRANSACT_IOCTL_REQ *io_req = NULL;
+ TRANSACT_IOCTL_RSP *io_rsp = NULL;
+ struct cifs_fid fid;
__u32 data_offset, data_count;
- char *end_of_smb;
+ __u8 *start, *end;
+ int io_rsp_len;
+ int oplock = 0;
+ int rc;
- cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid);
- rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path);
+
+ if (cap_unix(tcon->ses))
+ return -EOPNOTSUPP;
+
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .create_options = cifs_create_options(cifs_sb,
+ OPEN_REPARSE_POINT),
+ .disposition = FILE_OPEN,
+ .path = full_path,
+ .fid = &fid,
+ };
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
- pSMB->TotalParameterCount = 0 ;
- pSMB->TotalDataCount = 0;
- pSMB->MaxParameterCount = cpu_to_le32(2);
- /* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
- pSMB->MaxSetupCount = 4;
- pSMB->Reserved = 0;
- pSMB->ParameterOffset = 0;
- pSMB->DataCount = 0;
- pSMB->DataOffset = 0;
- pSMB->SetupCount = 4;
- pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
- pSMB->ParameterCount = pSMB->TotalParameterCount;
- pSMB->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT);
- pSMB->IsFsctl = 1; /* FSCTL */
- pSMB->IsRootFlag = 0;
- pSMB->Fid = fid; /* file handle always le */
- pSMB->ByteCount = 0;
+ rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon,
+ (void **)&io_req, (void **)&io_rsp);
+ if (rc)
+ goto error;
- rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *) pSMBr, &bytes_returned, 0);
- if (rc) {
- cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc);
- goto qreparse_out;
- }
+ io_req->TotalParameterCount = 0;
+ io_req->TotalDataCount = 0;
+ io_req->MaxParameterCount = cpu_to_le32(2);
+ /* BB find exact data count max from sess structure BB */
+ io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
+ io_req->MaxSetupCount = 4;
+ io_req->Reserved = 0;
+ io_req->ParameterOffset = 0;
+ io_req->DataCount = 0;
+ io_req->DataOffset = 0;
+ io_req->SetupCount = 4;
+ io_req->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
+ io_req->ParameterCount = io_req->TotalParameterCount;
+ io_req->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT);
+ io_req->IsFsctl = 1;
+ io_req->IsRootFlag = 0;
+ io_req->Fid = fid.netfid;
+ io_req->ByteCount = 0;
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)io_req,
+ (struct smb_hdr *)io_rsp, &io_rsp_len, 0);
+ if (rc)
+ goto error;
- data_offset = le32_to_cpu(pSMBr->DataOffset);
- data_count = le32_to_cpu(pSMBr->DataCount);
- if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
- /* BB also check enough total bytes returned */
- rc = -EIO; /* bad smb */
- goto qreparse_out;
- }
- if (!data_count || (data_count > 2048)) {
+ data_offset = le32_to_cpu(io_rsp->DataOffset);
+ data_count = le32_to_cpu(io_rsp->DataCount);
+ if (get_bcc(&io_rsp->hdr) < 2 || data_offset > 512 ||
+ !data_count || data_count > 2048) {
rc = -EIO;
- cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
- goto qreparse_out;
- }
- end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
- reparse_buf = (struct reparse_symlink_data *)
- ((char *)&pSMBr->hdr.Protocol + data_offset);
- if ((char *)reparse_buf >= end_of_smb) {
- rc = -EIO;
- goto qreparse_out;
- }
- if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) {
- cifs_dbg(FYI, "NFS style reparse tag\n");
- posix_buf = (struct reparse_posix_data *)reparse_buf;
-
- if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) {
- cifs_dbg(FYI, "unsupported file type 0x%llx\n",
- le64_to_cpu(posix_buf->InodeType));
- rc = -EOPNOTSUPP;
- goto qreparse_out;
- }
- is_unicode = true;
- sub_len = le16_to_cpu(reparse_buf->ReparseDataLength);
- if (posix_buf->PathBuffer + sub_len > end_of_smb) {
- cifs_dbg(FYI, "reparse buf beyond SMB\n");
- rc = -EIO;
- goto qreparse_out;
- }
- *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer,
- sub_len, is_unicode, nls_codepage);
- goto qreparse_out;
- } else if (reparse_buf->ReparseTag !=
- cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) {
- rc = -EOPNOTSUPP;
- goto qreparse_out;
+ goto error;
}
- /* Reparse tag is NTFS symlink */
- sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) +
- reparse_buf->PathBuffer;
- sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength);
- if (sub_start + sub_len > end_of_smb) {
- cifs_dbg(FYI, "reparse buf beyond SMB\n");
+ end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount;
+ start = (__u8 *)&io_rsp->hdr.Protocol + data_offset;
+ if (start >= end) {
rc = -EIO;
- goto qreparse_out;
+ goto error;
}
- if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
- is_unicode = true;
- else
- is_unicode = false;
-
- /* BB FIXME investigate remapping reserved chars here */
- *symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode,
- nls_codepage);
- if (!*symlinkinfo)
- rc = -ENOMEM;
-qreparse_out:
- cifs_buf_release(pSMB);
- /*
- * Note: On -EAGAIN error only caller can retry on handle based calls
- * since file handle passed in no longer valid.
- */
+ *tag = le32_to_cpu(((struct reparse_data_buffer *)start)->ReparseTag);
+ rsp->iov_base = io_rsp;
+ rsp->iov_len = io_rsp_len;
+ *rsp_buftype = CIFS_LARGE_BUFFER;
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ return 0;
+
+error:
+ cifs_buf_release(io_req);
+ CIFSSMBClose(xid, tcon, fid.netfid);
return rc;
}
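
Editor's note: with the refactor above, SMB1 callers fetch only the reparse tag plus the raw NT_TRANSACT_IOCTL response and leave interpretation to the parse_reparse_point() hook. A hypothetical caller sketch against the prototype added to cifsproto.h earlier in this diff; it assumes the cifs-internal headers, and buffer handling is simplified.

/* Hypothetical caller, illustrative only. */
static int example_get_reparse_tag(const unsigned int xid,
				   struct cifs_tcon *tcon,
				   struct cifs_sb_info *cifs_sb,
				   const char *full_path, u32 *tag)
{
	struct kvec rsp_iov = {};
	int rsp_buftype = CIFS_NO_BUFFER;
	int rc;

	rc = cifs_query_reparse_point(xid, tcon, cifs_sb, full_path,
				      tag, &rsp_iov, &rsp_buftype);
	if (!rc) {
		/* rsp_iov holds the raw TRANSACT_IOCTL_RSP for later parsing. */
		free_rsp_buf(rsp_buftype, rsp_iov.iov_base);
	}
	return rc;
}
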
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 7b923e36501b..dc9b95ca71e6 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -119,6 +119,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
static void smb2_query_server_interfaces(struct work_struct *work)
{
int rc;
+ int xid;
struct cifs_tcon *tcon = container_of(work,
struct cifs_tcon,
query_interfaces.work);
@@ -126,8 +127,14 @@ static void smb2_query_server_interfaces(struct work_struct *work)
/*
* query server network interfaces, in case they change
*/
- rc = SMB3_request_interfaces(0, tcon, false);
+ xid = get_xid();
+ rc = SMB3_request_interfaces(xid, tcon, false);
+ free_xid(xid);
+
if (rc) {
+ if (rc == -EOPNOTSUPP)
+ return;
+
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
}
@@ -156,20 +163,25 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
/* If server is a channel, select the primary channel */
pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
- spin_lock(&pserver->srv_lock);
+ /* if we need to signal just this channel */
if (!all_channels) {
- pserver->tcpStatus = CifsNeedReconnect;
- spin_unlock(&pserver->srv_lock);
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus != CifsExiting)
+ server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&server->srv_lock);
return;
}
- spin_unlock(&pserver->srv_lock);
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
+ if (!ses->chans[i].server)
+ continue;
+
spin_lock(&ses->chans[i].server->srv_lock);
- ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ if (ses->chans[i].server->tcpStatus != CifsExiting)
+ ses->chans[i].server->tcpStatus = CifsNeedReconnect;
spin_unlock(&ses->chans[i].server->srv_lock);
}
spin_unlock(&ses->chan_lock);
@@ -204,14 +216,29 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
/* If server is a channel, select the primary channel */
pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
+ /*
+ * if the server has been marked for termination, there is a
+ * chance that the remaining channels all need reconnect. To be
+ * on the safer side, mark the session and trees for reconnect
+ * for this scenario. This might cause a few redundant session
+ * setup and tree connect requests, but it is better than not doing
+ * a tree connect when needed, and all following requests failing
+ */
+ if (server->terminate) {
+ mark_smb_session = true;
+ server = pserver;
+ }
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
/* check if iface is still active */
- if (!cifs_chan_is_iface_active(ses, server))
+ spin_lock(&ses->chan_lock);
+ if (!cifs_chan_is_iface_active(ses, server)) {
+ spin_unlock(&ses->chan_lock);
cifs_chan_update_iface(ses, server);
+ spin_lock(&ses->chan_lock);
+ }
- spin_lock(&ses->chan_lock);
if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) {
spin_unlock(&ses->chan_lock);
continue;
@@ -241,6 +268,8 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&tcon->tc_lock);
tcon->status = TID_NEED_RECON;
spin_unlock(&tcon->tc_lock);
+
+ cancel_delayed_work(&tcon->query_interfaces);
}
if (ses->tcon_ipc) {
ses->tcon_ipc->need_reconnect = true;
@@ -1179,7 +1208,12 @@ next_pdu:
server->total_read += length;
if (server->ops->next_header) {
- next_offset = server->ops->next_header(buf);
+ if (server->ops->next_header(server, buf, &next_offset)) {
+ cifs_dbg(VFS, "%s: malformed response (next_offset=%u)\n",
+ __func__, next_offset);
+ cifs_reconnect(server, true);
+ continue;
+ }
if (next_offset)
server->pdu_size = next_offset;
}
@@ -1586,10 +1620,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
list_del_init(&server->tcp_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- /* For secondary channels, we pick up ref-count on the primary server */
- if (SERVER_IS_CHAN(server))
- cifs_put_tcp_session(server->primary_server, from_reconnect);
-
cancel_delayed_work_sync(&server->echo);
if (from_reconnect)
@@ -1603,6 +1633,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
else
cancel_delayed_work_sync(&server->reconnect);
+ /* For secondary channels, we pick up ref-count on the primary server */
+ if (SERVER_IS_CHAN(server))
+ cifs_put_tcp_session(server->primary_server, from_reconnect);
+
spin_lock(&server->srv_lock);
server->tcpStatus = CifsExiting;
spin_unlock(&server->srv_lock);
@@ -1969,9 +2003,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
void __cifs_put_smb_ses(struct cifs_ses *ses)
{
- unsigned int rc, xid;
- unsigned int chan_count;
struct TCP_Server_Info *server = ses->server;
+ unsigned int xid;
+ size_t i;
+ int rc;
spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_EXITING) {
@@ -2017,20 +2052,20 @@ void __cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- chan_count = ses->chan_count;
-
/* close any extra channels */
- if (chan_count > 1) {
- int i;
-
- for (i = 1; i < chan_count; i++) {
- if (ses->chans[i].iface) {
- kref_put(&ses->chans[i].iface->refcount, release_iface);
- ses->chans[i].iface = NULL;
- }
- cifs_put_tcp_session(ses->chans[i].server, 0);
- ses->chans[i].server = NULL;
+ for (i = 1; i < ses->chan_count; i++) {
+ if (ses->chans[i].iface) {
+ kref_put(&ses->chans[i].iface->refcount, release_iface);
+ ses->chans[i].iface = NULL;
}
+ cifs_put_tcp_session(ses->chans[i].server, 0);
+ ses->chans[i].server = NULL;
+ }
+
+ /* we now account for primary channel in iface->refcount */
+ if (ses->chans[0].iface) {
+ kref_put(&ses->chans[0].iface->refcount, release_iface);
+ ses->chans[0].server = NULL;
}
sesInfoFree(ses);
@@ -3560,7 +3595,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
ctx->prepath = NULL;
out:
- cifs_try_adding_channels(cifs_sb, mnt_ctx.ses);
+ cifs_try_adding_channels(mnt_ctx.ses);
rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
if (rc)
goto error;
@@ -3849,8 +3884,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
spin_unlock(&ses->chan_lock);
- if (!is_binding)
+ if (!is_binding) {
ses->ses_status = SES_IN_SETUP;
+
+ /* force iface_list refresh */
+ ses->iface_last_update = 0;
+ }
spin_unlock(&ses->ses_lock);
/* update ses ip_addr only for primary chan */
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 81b84151450d..a8a1d386da65 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -263,15 +263,23 @@ out:
return rc;
}
-/* Resolve UNC hostname in @ctx->source and set ip addr in @ctx->dstaddr */
+/*
+ * If @ctx->dfs_automount, then update @ctx->dstaddr earlier with the DFS root
+ * server from where we'll start following any referrals. Otherwise rely on the
+ * value provided by mount(2) as the user might not have dns_resolver key set up
+ * and would therefore fail to upcall to resolve the UNC hostname under @ctx->source.
+ */
static int update_fs_context_dstaddr(struct smb3_fs_context *ctx)
{
struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
- int rc;
+ int rc = 0;
- rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL);
- if (!rc)
- cifs_set_port(addr, ctx->port);
+ if (!ctx->nodfs && ctx->dfs_automount) {
+ rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL);
+ if (!rc)
+ cifs_set_port(addr, ctx->port);
+ ctx->dfs_automount = false;
+ }
return rc;
}
diff --git a/fs/smb/client/export.c b/fs/smb/client/export.c
index 37c28415df1e..d606e8cbcb7d 100644
--- a/fs/smb/client/export.c
+++ b/fs/smb/client/export.c
@@ -41,13 +41,12 @@ static struct dentry *cifs_get_parent(struct dentry *dentry)
}
const struct export_operations cifs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = cifs_get_parent,
-/* Following five export operations are unneeded so far and can default:
- .get_dentry =
- .get_name =
- .find_exported_dentry =
- .decode_fh =
- .encode_fs = */
+/*
+ * Following export operations are mandatory for NFS export support:
+ * .fh_to_dentry =
+ */
};
#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 2108b3b40ce9..32a8525415d9 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1085,7 +1085,8 @@ int cifs_close(struct inode *inode, struct file *file)
!test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
dclose) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
}
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
@@ -2596,7 +2597,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
if ((bytes_written > 0) && (offset))
rc = 0;
else if (bytes_written < 0)
@@ -4647,11 +4648,13 @@ static void cifs_readahead(struct readahead_control *ractl)
static int cifs_readpage_worker(struct file *file, struct page *page,
loff_t *poffset)
{
+ struct inode *inode = file_inode(file);
+ struct timespec64 atime, mtime;
char *read_data;
int rc;
/* Is the page cached? */
- rc = cifs_readpage_from_fscache(file_inode(file), page);
+ rc = cifs_readpage_from_fscache(inode, page);
if (rc == 0)
goto read_complete;
@@ -4666,11 +4669,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
cifs_dbg(FYI, "Bytes read %d\n", rc);
/* we do not want atime to be less than mtime, it broke some apps */
- file_inode(file)->i_atime = current_time(file_inode(file));
- if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime)))
- file_inode(file)->i_atime = file_inode(file)->i_mtime;
- else
- file_inode(file)->i_atime = current_time(file_inode(file));
+ atime = inode_set_atime_to_ts(inode, current_time(inode));
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&atime, &mtime) < 0)
+ inode_set_atime_to_ts(inode, inode_get_mtime(inode));
if (PAGE_SIZE > rc)
memset(read_data + rc, 0, PAGE_SIZE - rc);
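
Editor's note: the readpage hunk above keeps atime from falling behind mtime by comparing the freshly stored atime against mtime through the new accessors. A hedged sketch of that clamp as a standalone helper; example_clamp_atime() is an invented name.

#include <linux/fs.h>
#include <linux/time64.h>

static void example_clamp_atime(struct inode *inode)
{
	struct timespec64 atime, mtime;

	/* inode_set_atime_to_ts() returns the timestamp it just stored. */
	atime = inode_set_atime_to_ts(inode, current_time(inode));
	mtime = inode_get_mtime(inode);

	/* timespec64_compare() < 0 means atime is older than mtime. */
	if (timespec64_compare(&atime, &mtime) < 0)
		inode_set_atime_to_ts(inode, mtime);
}
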
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 9d8d34af0211..cf46916286d0 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -268,6 +268,7 @@ struct smb3_fs_context {
bool witness:1; /* use witness protocol */
char *leaf_fullpath;
struct cifs_ses *dfs_root_ses;
+ bool dfs_automount:1; /* set for dfs automount only */
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index 84f3b09367d2..a3d73720914f 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -49,12 +49,12 @@ static inline
void cifs_fscache_fill_coherency(struct inode *inode,
struct cifs_fscache_inode_coherency_data *cd)
{
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
memset(cd, 0, sizeof(*cd));
- cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
- cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
+ cd->last_write_time_sec = cpu_to_le64(mtime.tv_sec);
+ cd->last_write_time_nsec = cpu_to_le32(mtime.tv_nsec);
cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec);
cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec);
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index d7c302442c1e..09c5c0f5c96e 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -82,6 +82,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
{
struct cifs_fscache_inode_coherency_data cd;
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+ struct timespec64 mtime;
cifs_dbg(FYI, "%s: revalidating inode %llu\n",
__func__, cifs_i->uniqueid);
@@ -101,7 +102,8 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
/* revalidate if mtime or size have changed */
fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
- if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+ mtime = inode_get_mtime(inode);
+ if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
cifs_i->server_eof == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
__func__, cifs_i->uniqueid);
@@ -164,10 +166,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode);
/* we do not want atime to be less than mtime, it broke some apps */
if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0)
- inode->i_atime = fattr->cf_mtime;
+ inode_set_atime_to_ts(inode, fattr->cf_mtime);
else
- inode->i_atime = fattr->cf_atime;
- inode->i_mtime = fattr->cf_mtime;
+ inode_set_atime_to_ts(inode, fattr->cf_atime);
+ inode_set_mtime_to_ts(inode, fattr->cf_mtime);
inode_set_ctime_to_ts(inode, fattr->cf_ctime);
inode->i_rdev = fattr->cf_rdev;
cifs_nlink_fattr_to_inode(inode, fattr);
@@ -457,8 +459,7 @@ static int cifs_get_unix_fattr(const unsigned char *full_path,
return -EOPNOTSUPP;
rc = server->ops->query_symlink(xid, tcon,
cifs_sb, full_path,
- &fattr->cf_symlink_target,
- NULL);
+ &fattr->cf_symlink_target);
cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc);
}
return rc;
@@ -592,6 +593,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
cifs_dbg(FYI, "Symlink\n");
fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
+ } else if (memcmp("LnxFIFO", pbuf, 8) == 0) {
+ cifs_dbg(FYI, "FIFO\n");
+ fattr->cf_mode |= S_IFIFO;
+ fattr->cf_dtype = DT_FIFO;
} else {
fattr->cf_mode |= S_IFREG; /* file? */
fattr->cf_dtype = DT_REG;
@@ -716,10 +721,51 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
}
+static inline dev_t nfs_mkdev(struct reparse_posix_data *buf)
+{
+ u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer);
+
+ return MKDEV(v >> 32, v & 0xffffffff);
+}
+
bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
- u32 tag)
+ struct cifs_open_info_data *data)
{
+ struct reparse_posix_data *buf = data->reparse.posix;
+ u32 tag = data->reparse.tag;
+
+ if (tag == IO_REPARSE_TAG_NFS && buf) {
+ switch (le64_to_cpu(buf->InodeType)) {
+ case NFS_SPECFILE_CHR:
+ fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_CHR;
+ fattr->cf_rdev = nfs_mkdev(buf);
+ break;
+ case NFS_SPECFILE_BLK:
+ fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_BLK;
+ fattr->cf_rdev = nfs_mkdev(buf);
+ break;
+ case NFS_SPECFILE_FIFO:
+ fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_FIFO;
+ break;
+ case NFS_SPECFILE_SOCK:
+ fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_SOCK;
+ break;
+ case NFS_SPECFILE_LNK:
+ fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_LNK;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ return true;
+ }
+
switch (tag) {
case IO_REPARSE_TAG_LX_SYMLINK:
fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode;
@@ -744,7 +790,7 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
case 0: /* SMB1 symlink */
case IO_REPARSE_TAG_SYMLINK:
case IO_REPARSE_TAG_NFS:
- fattr->cf_mode = S_IFLNK;
+ fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode;
fattr->cf_dtype = DT_LNK;
break;
default:
@@ -785,7 +831,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
if (cifs_open_data_reparse(data) &&
- cifs_reparse_point_to_fattr(cifs_sb, fattr, data->reparse_tag))
+ cifs_reparse_point_to_fattr(cifs_sb, fattr, data))
goto out_reparse;
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
@@ -819,6 +865,8 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
out_reparse:
if (S_ISLNK(fattr->cf_mode)) {
+ if (likely(data->symlink_target))
+ fattr->cf_eof = strnlen(data->symlink_target, PATH_MAX);
fattr->cf_symlink_target = data->symlink_target;
data->symlink_target = NULL;
}
@@ -850,7 +898,7 @@ cifs_get_file_info(struct file *filp)
data.adjust_tz = false;
if (data.symlink_target) {
data.symlink = true;
- data.reparse_tag = IO_REPARSE_TAG_SYMLINK;
+ data.reparse.tag = IO_REPARSE_TAG_SYMLINK;
}
cifs_open_info_to_fattr(&fattr, &data, inode->i_sb);
break;
@@ -1019,7 +1067,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct kvec rsp_iov, *iov = NULL;
int rsp_buftype = CIFS_NO_BUFFER;
- u32 tag = data->reparse_tag;
+ u32 tag = data->reparse.tag;
int rc = 0;
if (!tag && server->ops->query_reparse_point) {
@@ -1029,22 +1077,28 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
if (!rc)
iov = &rsp_iov;
}
- switch ((data->reparse_tag = tag)) {
+
+ rc = -EOPNOTSUPP;
+ switch ((data->reparse.tag = tag)) {
case 0: /* SMB1 symlink */
- iov = NULL;
- fallthrough;
- case IO_REPARSE_TAG_NFS:
- case IO_REPARSE_TAG_SYMLINK:
- if (!data->symlink_target && server->ops->query_symlink) {
+ if (server->ops->query_symlink) {
rc = server->ops->query_symlink(xid, tcon,
cifs_sb, full_path,
- &data->symlink_target,
- iov);
+ &data->symlink_target);
}
break;
case IO_REPARSE_TAG_MOUNT_POINT:
cifs_create_junction_fattr(fattr, sb);
+ rc = 0;
goto out;
+ default:
+ if (data->symlink_target) {
+ rc = 0;
+ } else if (server->ops->parse_reparse_point) {
+ rc = server->ops->parse_reparse_point(cifs_sb,
+ iov, data);
+ }
+ break;
}
cifs_open_info_to_fattr(fattr, data, sb);
@@ -1816,7 +1870,7 @@ out_reval:
when needed */
inode_set_ctime_current(inode);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
@@ -2131,7 +2185,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifsInode->time = 0;
inode_set_ctime_current(d_inode(direntry));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
rmdir_exit:
free_dentry_path(page);
@@ -2337,9 +2391,6 @@ unlink_target:
/* force revalidate to go get info when needed */
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
- source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir,
- inode_set_ctime_current(target_dir));
-
cifs_rename_exit:
kfree(info_buf_source);
free_dentry_path(page2);
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index f7160003e0ed..e2f92c21fff5 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -117,6 +117,20 @@ out_drop_write:
return rc;
}
+static long smb_mnt_get_tcon_info(struct cifs_tcon *tcon, void __user *arg)
+{
+ int rc = 0;
+ struct smb_mnt_tcon_info tcon_inf;
+
+ tcon_inf.tid = tcon->tid;
+ tcon_inf.session_id = tcon->ses->Suid;
+
+ if (copy_to_user(arg, &tcon_inf, sizeof(struct smb_mnt_tcon_info)))
+ rc = -EFAULT;
+
+ return rc;
+}
+
static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
void __user *arg)
{
@@ -129,6 +143,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
fsinf->version = 1;
fsinf->protocol_id = tcon->ses->server->vals->protocol_id;
+ fsinf->tcon_flags = tcon->Flags;
fsinf->device_characteristics =
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics);
fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
@@ -414,6 +429,17 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
tcon = tlink_tcon(pSMBFile->tlink);
rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
break;
+ case CIFS_IOC_GET_TCON_INFO:
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
+ rc = smb_mnt_get_tcon_info(tcon, (void __user *)arg);
+ cifs_put_tlink(tlink);
+ break;
case CIFS_ENUMERATE_SNAPSHOTS:
if (pSMBFile == NULL)
break;
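
Editor's note: the CIFS_IOC_GET_TCON_INFO ioctl defined in cifs_ioctl.h earlier in this diff can be exercised from userspace on any file under a cifs mount. A hypothetical userspace sketch follows; the include path for cifs_ioctl.h is an assumption, and error handling is minimal.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "cifs_ioctl.h"	/* install location is an assumption */

int main(int argc, char **argv)
{
	struct smb_mnt_tcon_info info;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, CIFS_IOC_GET_TCON_INFO, &info) < 0) {
		perror("CIFS_IOC_GET_TCON_INFO");
		return 1;
	}
	printf("tid=0x%x session_id=0x%llx\n", info.tid,
	       (unsigned long long)info.session_id);
	close(fd);
	return 0;
}
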
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index c66be4904e1f..a1da50e66fbb 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -42,23 +42,11 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
rc = cifs_alloc_hash("md5", &md5);
if (rc)
- goto symlink_hash_err;
+ return rc;
- rc = crypto_shash_init(md5);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
- goto symlink_hash_err;
- }
- rc = crypto_shash_update(md5, link_str, link_len);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
- goto symlink_hash_err;
- }
- rc = crypto_shash_final(md5, md5_hash);
+ rc = crypto_shash_digest(md5, link_str, link_len, md5_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
-
-symlink_hash_err:
cifs_free_hash(&md5);
return rc;
}
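
Editor's note: the link.c hunk above replaces the init/update/final sequence with crypto_shash_digest(), which hashes a contiguous buffer in one call. A hedged sketch of the resulting shape; it assumes the cifs_alloc_hash()/cifs_free_hash() helpers already used in this file, and trims logging.

static int example_md5(const u8 *data, unsigned int len, u8 *out)
{
	struct shash_desc *md5 = NULL;
	int rc;

	rc = cifs_alloc_hash("md5", &md5);
	if (rc)
		return rc;

	/* One-shot equivalent of crypto_shash_init() + _update() + _final(). */
	rc = crypto_shash_digest(md5, data, len, out);

	cifs_free_hash(&md5);
	return rc;
}
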
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 35b176457bbe..c2137ea3c253 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -363,6 +363,10 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
cifs_dbg(VFS, "Length less than smb header size\n");
}
return -EIO;
+ } else if (total_read < sizeof(*smb) + 2 * smb->WordCount) {
+ cifs_dbg(VFS, "%s: can't read BCC due to invalid WordCount(%u)\n",
+ __func__, smb->WordCount);
+ return -EIO;
}
/* otherwise, there is enough to get to the BCC */
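
Editor's note: the new checkSMB() bound guards the later byte-count (BCC) access, which sits 2 * WordCount bytes past the fixed SMB1 header. A short worked example, with the caveat that the exact header size depends on struct smb_hdr:

/*
 * Illustrative: a frame with WordCount = 10 needs at least
 * sizeof(*smb) + 2 * 10 bytes just to reach the BCC field. A forged
 * WordCount larger than what was actually received would otherwise push
 * the BCC read past the end of the buffer, hence the -EIO above.
 */
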
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index c8f5ed8a69f1..a6968573b775 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -117,6 +117,18 @@ cifs_build_devname(char *nodename, const char *prepath)
return dev;
}
+static bool is_dfs_mount(struct dentry *dentry)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ bool ret;
+
+ spin_lock(&tcon->tc_lock);
+ ret = !!tcon->origin_fullpath;
+ spin_unlock(&tcon->tc_lock);
+ return ret;
+}
+
/* Return full path out of a dentry set for automount */
static char *automount_fullpath(struct dentry *dentry, void *page)
{
@@ -212,8 +224,9 @@ static struct vfsmount *cifs_do_automount(struct path *path)
ctx->source = NULL;
goto out;
}
- cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s\n",
- __func__, ctx->source, ctx->UNC, ctx->prepath);
+ ctx->dfs_automount = is_dfs_mount(mntpt);
+ cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dfs_automount=%d\n",
+ __func__, ctx->source, ctx->UNC, ctx->prepath, ctx->dfs_automount);
mnt = fc_mount(fc);
out:
diff --git a/fs/smb/client/ntlmssp.h b/fs/smb/client/ntlmssp.h
index 2c5dde2ece58..875de43b72de 100644
--- a/fs/smb/client/ntlmssp.h
+++ b/fs/smb/client/ntlmssp.h
@@ -133,8 +133,8 @@ typedef struct _AUTHENTICATE_MESSAGE {
SECURITY_BUFFER WorkstationName;
SECURITY_BUFFER SessionKey;
__le32 NegotiateFlags;
- /* SECURITY_BUFFER for version info not present since we
- do not set the version is present flag */
+ struct ntlmssp_version Version;
+ /* SECURITY_BUFFER */
char UserString[];
} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 47fc22de8d20..d30ea2005eb3 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -153,6 +153,10 @@ static bool reparse_file_needs_reval(const struct cifs_fattr *fattr)
static void
cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
{
+ struct cifs_open_info_data data = {
+ .reparse = { .tag = fattr->cf_cifstag, },
+ };
+
fattr->cf_uid = cifs_sb->ctx->linux_uid;
fattr->cf_gid = cifs_sb->ctx->linux_gid;
@@ -165,7 +169,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
* reasonably map some of them to directories vs. files vs. symlinks
*/
if ((fattr->cf_cifsattrs & ATTR_REPARSE) &&
- cifs_reparse_point_to_fattr(cifs_sb, fattr, fattr->cf_cifstag))
+ cifs_reparse_point_to_fattr(cifs_sb, fattr, &data))
goto out_reparse;
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 79f26c560edf..2d3b332a79a1 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -24,7 +24,7 @@
#include "fs_context.h"
static int
-cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+cifs_ses_add_channel(struct cifs_ses *ses,
struct cifs_server_iface *iface);
bool
@@ -69,7 +69,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
/* channel helper functions. assumed that chan_lock is held by caller. */
-unsigned int
+int
cifs_ses_get_chan_index(struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
@@ -85,14 +85,17 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
server->conn_id);
WARN_ON(1);
- return 0;
+ return CIFS_INVAL_CHAN_INDEX;
}
void
cifs_chan_set_in_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
- unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
ses->chans[chan_index].in_reconnect = true;
}
@@ -103,6 +106,9 @@ cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
ses->chans[chan_index].in_reconnect = false;
}
@@ -112,6 +118,9 @@ cifs_chan_in_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return CIFS_CHAN_IN_RECONNECT(ses, chan_index);
}
@@ -121,6 +130,9 @@ cifs_chan_set_need_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
set_bit(chan_index, &ses->chans_need_reconnect);
cifs_dbg(FYI, "Set reconnect bitmask for chan %u; now 0x%lx\n",
chan_index, ses->chans_need_reconnect);
@@ -132,6 +144,9 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
clear_bit(chan_index, &ses->chans_need_reconnect);
cifs_dbg(FYI, "Cleared reconnect bitmask for chan %u; now 0x%lx\n",
chan_index, ses->chans_need_reconnect);
@@ -143,6 +158,9 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);
}
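The cifs_ses_get_chan_index() change above replaces the old "silently return channel 0" fallback with a CIFS_INVAL_CHAN_INDEX sentinel, and every caller now either bails out (setters) or assumes the worst (predicates). A small self-contained sketch of the same pattern, with hypothetical toy_* names:

#include <stdbool.h>
#include <stdio.h>

#define TOY_INVAL_INDEX         (-1)
#define TOY_MAX_CHANNELS        16

struct toy_session {
        int servers[TOY_MAX_CHANNELS];  /* server id per channel, 0 = unused */
        int chan_count;
};

/* Return the channel index for a server id, or TOY_INVAL_INDEX if absent. */
static int toy_get_chan_index(const struct toy_session *ses, int server_id)
{
        for (int i = 0; i < ses->chan_count; i++)
                if (ses->servers[i] == server_id)
                        return i;
        return TOY_INVAL_INDEX;
}

/* Setter: skip quietly on the sentinel instead of clobbering index 0. */
static void toy_set_in_reconnect(struct toy_session *ses, int server_id,
                                 bool *in_reconnect)
{
        int idx = toy_get_chan_index(ses, server_id);

        if (idx == TOY_INVAL_INDEX)
                return;
        in_reconnect[idx] = true;
}

/* Predicate: err on the safe side when the channel cannot be found. */
static bool toy_needs_reconnect(const struct toy_session *ses, int server_id,
                                const bool *need_reconnect)
{
        int idx = toy_get_chan_index(ses, server_id);

        if (idx == TOY_INVAL_INDEX)
                return true;
        return need_reconnect[idx];
}

int main(void)
{
        struct toy_session ses = { .servers = {7, 9}, .chan_count = 2 };
        bool flags[TOY_MAX_CHANNELS] = {0};

        toy_set_in_reconnect(&ses, 42, flags);                  /* unknown id: no-op */
        printf("%d\n", toy_needs_reconnect(&ses, 42, flags));   /* 1 */
        printf("%d\n", toy_needs_reconnect(&ses, 7, flags));    /* 0 */
        return 0;
}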
@@ -152,19 +170,24 @@ cifs_chan_is_iface_active(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return ses->chans[chan_index].iface &&
ses->chans[chan_index].iface->is_active;
}
/* returns number of channels added */
-int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
+int cifs_try_adding_channels(struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
int old_chan_count, new_chan_count;
int left;
int rc = 0;
int tries = 0;
+ size_t iface_weight = 0, iface_min_speed = 0;
struct cifs_server_iface *iface = NULL, *niface = NULL;
+ struct cifs_server_iface *last_iface = NULL;
spin_lock(&ses->chan_lock);
@@ -186,28 +209,17 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
}
if (!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- ses->chan_max = 1;
spin_unlock(&ses->chan_lock);
cifs_server_dbg(VFS, "no multichannel support\n");
return 0;
}
spin_unlock(&ses->chan_lock);
- /*
- * Keep connecting to same, fastest, iface for all channels as
- * long as its RSS. Try next fastest one if not RSS or channel
- * creation fails.
- */
- spin_lock(&ses->iface_lock);
- iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
- iface_head);
- spin_unlock(&ses->iface_lock);
-
while (left > 0) {
tries++;
if (tries > 3*ses->chan_max) {
- cifs_dbg(FYI, "too many channel open attempts (%d channels left to open)\n",
+ cifs_dbg(VFS, "too many channel open attempts (%d channels left to open)\n",
left);
break;
}
@@ -215,23 +227,41 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
spin_lock(&ses->iface_lock);
if (!ses->iface_count) {
spin_unlock(&ses->iface_lock);
+ cifs_dbg(VFS, "server %s does not advertise interfaces\n",
+ ses->server->hostname);
break;
}
+ if (!iface)
+ iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
list_for_each_entry_safe_from(iface, niface, &ses->iface_list,
iface_head) {
+ /* do not mix rdma and non-rdma interfaces */
+ if (iface->rdma_capable != ses->server->rdma)
+ continue;
+
/* skip ifaces that are unusable */
if (!iface->is_active ||
(is_ses_using_iface(ses, iface) &&
- !iface->rss_capable)) {
+ !iface->rss_capable))
+ continue;
+
+ /* check if we already allocated enough channels */
+ iface_weight = iface->speed / iface_min_speed;
+
+ if (iface->weight_fulfilled >= iface_weight)
continue;
- }
/* take ref before unlock */
kref_get(&iface->refcount);
spin_unlock(&ses->iface_lock);
- rc = cifs_ses_add_channel(cifs_sb, ses, iface);
+ rc = cifs_ses_add_channel(ses, iface);
spin_lock(&ses->iface_lock);
if (rc) {
@@ -242,10 +272,21 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
continue;
}
- cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n",
+ iface->num_channels++;
+ iface->weight_fulfilled++;
+ cifs_dbg(VFS, "successfully opened new channel on iface:%pIS\n",
&iface->sockaddr);
break;
}
+
+ /* reached end of list. reset weight_fulfilled and start over */
+ if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_for_each_entry(iface, &ses->iface_list, iface_head)
+ iface->weight_fulfilled = 0;
+ spin_unlock(&ses->iface_lock);
+ iface = NULL;
+ continue;
+ }
spin_unlock(&ses->iface_lock);
left--;
@@ -256,6 +297,64 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
}
/*
+ * called when multichannel is disabled by the server.
+ * this is always called from smb2_reconnect and is
+ * never invoked from parallel threads.
+ */
+void
+cifs_disable_secondary_channels(struct cifs_ses *ses)
+{
+ int i, chan_count;
+ struct TCP_Server_Info *server;
+ struct cifs_server_iface *iface;
+
+ spin_lock(&ses->chan_lock);
+ chan_count = ses->chan_count;
+ if (chan_count == 1)
+ goto done;
+
+ ses->chan_count = 1;
+
+ /* for all secondary channels reset the need reconnect bit */
+ ses->chans_need_reconnect &= 1;
+
+ for (i = 1; i < chan_count; i++) {
+ iface = ses->chans[i].iface;
+ server = ses->chans[i].server;
+
+ /*
+ * remove these references first: we must drop chan_lock
+ * before taking iface_lock, which ranks above it
+ */
+ ses->chans[i].iface = NULL;
+ ses->chans[i].server = NULL;
+ spin_unlock(&ses->chan_lock);
+
+ if (iface) {
+ spin_lock(&ses->iface_lock);
+ iface->num_channels--;
+ if (iface->weight_fulfilled)
+ iface->weight_fulfilled--;
+ kref_put(&iface->refcount, release_iface);
+ spin_unlock(&ses->iface_lock);
+ }
+
+ if (server) {
+ if (!server->terminate) {
+ server->terminate = true;
+ cifs_signal_cifsd_for_reconnect(server, false);
+ }
+ cifs_put_tcp_session(server, false);
+ }
+
+ spin_lock(&ses->chan_lock);
+ }
+
+done:
+ spin_unlock(&ses->chan_lock);
+}
+
+/*
* update the iface for the channel if necessary.
* will return 0 when iface is updated, 1 if removed, 2 otherwise
* Must be called with chan_lock held.
@@ -264,13 +363,16 @@ int
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
{
unsigned int chan_index;
+ size_t iface_weight = 0, iface_min_speed = 0;
struct cifs_server_iface *iface = NULL;
struct cifs_server_iface *old_iface = NULL;
+ struct cifs_server_iface *last_iface = NULL;
+ struct sockaddr_storage ss;
int rc = 0;
spin_lock(&ses->chan_lock);
chan_index = cifs_ses_get_chan_index(ses, server);
- if (!chan_index) {
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
spin_unlock(&ses->chan_lock);
return 0;
}
@@ -284,14 +386,49 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
spin_unlock(&ses->chan_lock);
+ spin_lock(&server->srv_lock);
+ ss = server->dstaddr;
+ spin_unlock(&server->srv_lock);
+
spin_lock(&ses->iface_lock);
+ if (!ses->iface_count) {
+ spin_unlock(&ses->iface_lock);
+ cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname);
+ return 0;
+ }
+
+ last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
/* then look for a new one */
list_for_each_entry(iface, &ses->iface_list, iface_head) {
+ if (!chan_index) {
+ /* if we're trying to get the updated iface for primary channel */
+ if (!cifs_match_ipaddr((struct sockaddr *) &ss,
+ (struct sockaddr *) &iface->sockaddr))
+ continue;
+
+ kref_get(&iface->refcount);
+ break;
+ }
+
+ /* do not mix rdma and non-rdma interfaces */
+ if (iface->rdma_capable != server->rdma)
+ continue;
+
if (!iface->is_active ||
(is_ses_using_iface(ses, iface) &&
!iface->rss_capable)) {
continue;
}
+
+ /* check if we already allocated enough channels */
+ iface_weight = iface->speed / iface_min_speed;
+
+ if (iface->weight_fulfilled >= iface_weight)
+ continue;
+
kref_get(&iface->refcount);
break;
}
@@ -302,34 +439,52 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
cifs_dbg(FYI, "unable to find a suitable iface\n");
}
+ if (!iface) {
+ cifs_dbg(FYI, "unable to get the interface matching: %pIS\n",
+ &ss);
+ spin_unlock(&ses->iface_lock);
+ return 0;
+ }
+
/* now drop the ref to the current iface */
- if (old_iface && iface) {
+ if (old_iface) {
cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n",
&old_iface->sockaddr,
&iface->sockaddr);
+
+ old_iface->num_channels--;
+ if (old_iface->weight_fulfilled)
+ old_iface->weight_fulfilled--;
+ iface->num_channels++;
+ iface->weight_fulfilled++;
+
kref_put(&old_iface->refcount, release_iface);
} else if (old_iface) {
- cifs_dbg(FYI, "releasing ref to iface: %pIS\n",
+ /* if a new candidate is not found, keep things as is */
+ cifs_dbg(FYI, "could not replace iface: %pIS\n",
&old_iface->sockaddr);
- kref_put(&old_iface->refcount, release_iface);
- } else {
- WARN_ON(!iface);
- cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr);
+ } else if (!chan_index) {
+ /* special case: update interface for primary channel */
+ if (iface) {
+ cifs_dbg(FYI, "referencing primary channel iface: %pIS\n",
+ &iface->sockaddr);
+ iface->num_channels++;
+ iface->weight_fulfilled++;
+ }
}
spin_unlock(&ses->iface_lock);
- spin_lock(&ses->chan_lock);
- chan_index = cifs_ses_get_chan_index(ses, server);
- ses->chans[chan_index].iface = iface;
-
- /* No iface is found. if secondary chan, drop connection */
- if (!iface && SERVER_IS_CHAN(server))
- ses->chans[chan_index].server = NULL;
-
- spin_unlock(&ses->chan_lock);
+ if (iface) {
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ return 0;
+ }
- if (!iface && SERVER_IS_CHAN(server))
- cifs_put_tcp_session(server, false);
+ ses->chans[chan_index].iface = iface;
+ spin_unlock(&ses->chan_lock);
+ }
return rc;
}
@@ -355,7 +510,7 @@ cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
static int
-cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+cifs_ses_add_channel(struct cifs_ses *ses,
struct cifs_server_iface *iface)
{
struct TCP_Server_Info *chan_server;
@@ -434,7 +589,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
* This will be used for encoding/decoding user/domain/pw
* during sess setup auth.
*/
- ctx->local_nls = cifs_sb->local_nls;
+ ctx->local_nls = ses->local_nls;
/* Use RDMA if possible */
ctx->rdma = iface->rdma_capable;
@@ -480,20 +635,16 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
rc = cifs_negotiate_protocol(xid, ses, chan->server);
if (!rc)
- rc = cifs_setup_session(xid, ses, chan->server, cifs_sb->local_nls);
+ rc = cifs_setup_session(xid, ses, chan->server, ses->local_nls);
mutex_unlock(&ses->session_mutex);
out:
if (rc && chan->server) {
- /*
- * we should avoid race with these delayed works before we
- * remove this channel
- */
- cancel_delayed_work_sync(&chan->server->echo);
- cancel_delayed_work_sync(&chan->server->reconnect);
+ cifs_put_tcp_session(chan->server, 0);
spin_lock(&ses->chan_lock);
+
/* we rely on all bits beyond chan_count to be clear */
cifs_chan_clear_need_reconnect(ses, chan->server);
ses->chan_count--;
@@ -503,8 +654,6 @@ out:
*/
WARN_ON(ses->chan_count < 1);
spin_unlock(&ses->chan_lock);
-
- cifs_put_tcp_session(chan->server, 0);
}
kfree(ctx->UNC);
@@ -536,8 +685,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
- /* BB verify whether signing required on neg or just on auth frame
- (and NTLM case) */
+ /* BB verify whether signing required on neg or just auth frame (and NTLM case) */
capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
@@ -594,8 +742,10 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
/* copy domain */
if (ses->domainName == NULL) {
- /* Sending null domain better than using a bogus domain name (as
- we did briefly in 2.6.18) since server will use its default */
+ /*
+ * Sending null domain better than using a bogus domain name (as
+ * we did briefly in 2.6.18) since server will use its default
+ */
*bcc_ptr = 0;
*(bcc_ptr+1) = 0;
bytes_ret = 0;
@@ -614,8 +764,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
char *bcc_ptr = *pbcc_area;
int bytes_ret = 0;
- /* BB FIXME add check that strings total less
- than 335 or will need to send them as arrays */
+ /* BB FIXME add check that strings total less than 335 or will need to send as arrays */
/* copy user */
if (ses->user_name == NULL) {
@@ -660,8 +809,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
if (WARN_ON_ONCE(len < 0))
len = CIFS_MAX_DOMAINNAME_LEN - 1;
bcc_ptr += len;
- } /* else we will send a null domain name
- so the server will default to its own domain */
+ } /* else we send a null domain name so server will default to its own domain */
*bcc_ptr = 0;
bcc_ptr++;
@@ -757,11 +905,14 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
if (len > bleft)
return;
- /* No domain field in LANMAN case. Domain is
- returned by old servers in the SMB negprot response */
- /* BB For newer servers which do not support Unicode,
- but thus do return domain here we could add parsing
- for it later, but it is not very important */
+ /*
+ * No domain field in LANMAN case. Domain is
+ * returned by old servers in the SMB negprot response
+ *
+ * BB For newer servers which do not support Unicode,
+ * but thus do return domain here, we could add parsing
+ * for it later, but it is not very important
+ */
cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
@@ -817,9 +968,12 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
ses->ntlmssp->server_flags = server_flags;
memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
- /* In particular we can examine sign flags */
- /* BB spec says that if AvId field of MsvAvTimestamp is populated then
- we must set the MIC field of the AUTHENTICATE_MESSAGE */
+ /*
+ * In particular we can examine sign flags
+ *
+ * BB spec says that if AvId field of MsvAvTimestamp is populated then
+ * we must set the MIC field of the AUTHENTICATE_MESSAGE
+ */
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
@@ -1060,10 +1214,16 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
sec_blob->MessageType = NtLmAuthenticate;
+ /* send version information in ntlmssp authenticate also */
flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET |
- NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
- /* we only send version information in ntlmssp negotiate, so do not set this flag */
- flags = flags & ~NTLMSSP_NEGOTIATE_VERSION;
+ NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_VERSION |
+ NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
+
+ sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR;
+ sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL;
+ sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD);
+ sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3;
+
tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE);
sec_blob->NegotiateFlags = cpu_to_le32(flags);
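The reworked channel-allocation logic in this sess.c diff spreads channels across server interfaces in proportion to speed: an interface's weight is its speed divided by the slowest advertised speed, it is skipped once weight_fulfilled reaches that weight, and when the whole list has been exhausted the counters are reset and the scan restarts. A userspace sketch of that selection policy under assumed names (toy_* helpers, a list sorted fastest-first):

#include <stddef.h>
#include <stdio.h>

struct toy_iface {
        const char *name;
        size_t speed;             /* advertised link speed */
        size_t weight_fulfilled;  /* channels already placed on it */
};

/*
 * Pick the next interface, honouring speed-proportional weights.
 * Assumes the list is sorted fastest-first, so the last entry holds
 * the minimum speed.
 */
static struct toy_iface *toy_pick_iface(struct toy_iface *list, size_t n)
{
        size_t min_speed;

        if (!n)
                return NULL;
        min_speed = list[n - 1].speed;

        for (;;) {
                for (size_t i = 0; i < n; i++) {
                        size_t weight = list[i].speed / min_speed;

                        if (list[i].weight_fulfilled >= weight)
                                continue;
                        list[i].weight_fulfilled++;
                        return &list[i];
                }
                /* every interface hit its weight: reset and start over */
                for (size_t i = 0; i < n; i++)
                        list[i].weight_fulfilled = 0;
        }
}

int main(void)
{
        struct toy_iface ifaces[] = {
                { "fast", 10000, 0 },   /* weight 10 */
                { "slow",  1000, 0 },   /* weight 1  */
        };

        /* Prints "fast" ten times, then "slow" once. */
        for (int i = 0; i < 11; i++)
                printf("%s\n", toy_pick_iface(ifaces, 2)->name);
        return 0;
}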
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 9bf8735cdd1e..a9eaba8083b0 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -976,64 +976,37 @@ static int cifs_query_symlink(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
- char **target_path,
- struct kvec *rsp_iov)
+ char **target_path)
{
int rc;
- int oplock = 0;
- bool is_reparse_point = !!rsp_iov;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
- cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+ cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path);
- if (is_reparse_point) {
- cifs_dbg(VFS, "reparse points not handled for SMB1 symlinks\n");
+ if (!cap_unix(tcon->ses))
return -EOPNOTSUPP;
- }
-
- /* Check for unix extensions */
- if (cap_unix(tcon->ses)) {
- rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (rc == -EREMOTE)
- rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
- target_path,
- cifs_sb->local_nls);
-
- goto out;
- }
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .cifs_sb = cifs_sb,
- .desired_access = FILE_READ_ATTRIBUTES,
- .create_options = cifs_create_options(cifs_sb,
- OPEN_REPARSE_POINT),
- .disposition = FILE_OPEN,
- .path = full_path,
- .fid = &fid,
- };
-
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
- if (rc)
- goto out;
-
- rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path,
- cifs_sb->local_nls);
- if (rc)
- goto out_close;
- convert_delimiter(*target_path, '/');
-out_close:
- CIFSSMBClose(xid, tcon, fid.netfid);
-out:
- if (!rc)
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+ rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
+ if (rc == -EREMOTE)
+ rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
+ target_path, cifs_sb->local_nls);
return rc;
}
+static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb,
+ struct kvec *rsp_iov,
+ struct cifs_open_info_data *data)
+{
+ struct reparse_data_buffer *buf;
+ TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base;
+ bool unicode = !!(io->hdr.Flags2 & SMBFLG2_UNICODE);
+ u32 plen = le16_to_cpu(io->ByteCount);
+
+ buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
+ le32_to_cpu(io->DataOffset));
+ return parse_reparse_point(buf, plen, cifs_sb, unicode, data);
+}
+
static bool
cifs_is_read_op(__u32 oplock)
{
@@ -1068,15 +1041,7 @@ cifs_make_node(unsigned int xid, struct inode *inode,
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct inode *newinode = NULL;
- int rc = -EPERM;
- struct cifs_open_info_data buf = {};
- struct cifs_io_parms io_parms;
- __u32 oplock = 0;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
- unsigned int bytes_written;
- struct win_dev *pdev;
- struct kvec iov[2];
+ int rc;
if (tcon->unix_ext) {
/*
@@ -1110,74 +1075,18 @@ cifs_make_node(unsigned int xid, struct inode *inode,
d_instantiate(dentry, newinode);
return rc;
}
-
/*
- * SMB1 SFU emulation: should work with all servers, but only
- * support block and char device (no socket & fifo)
+ * Check if mounted with the 'sfu' mount parm.
+ * SFU emulation should work with all servers, but only
+ * supports block and char device (no socket & fifo),
+ * and was used by default in earlier versions of Windows
*/
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- return rc;
-
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
- return rc;
-
- cifs_dbg(FYI, "sfu compat create special file\n");
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .cifs_sb = cifs_sb,
- .desired_access = GENERIC_WRITE,
- .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
- CREATE_OPTION_SPECIAL),
- .disposition = FILE_CREATE,
- .path = full_path,
- .fid = &fid,
- };
-
- if (tcon->ses->server->oplocks)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf);
- if (rc)
- return rc;
-
- /*
- * BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely.
- */
-
- pdev = (struct win_dev *)&buf.fi;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = &buf.fi;
- iov[1].iov_len = sizeof(struct win_dev);
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major = cpu_to_le64(MAJOR(dev));
- pdev->minor = cpu_to_le64(MINOR(dev));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major = cpu_to_le64(MAJOR(dev));
- pdev->minor = cpu_to_le64(MINOR(dev));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- }
- tcon->ses->server->ops->close(xid, tcon, &fid);
- d_drop(dentry);
-
- /* FIXME: add code here to set EAs */
-
- cifs_free_open_info(&buf);
- return rc;
+ return -EPERM;
+ return cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
}
-
-
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1214,6 +1123,7 @@ struct smb_version_operations smb1_operations = {
.is_path_accessible = cifs_is_path_accessible,
.can_echo = cifs_can_echo,
.query_path_info = cifs_query_path_info,
+ .query_reparse_point = cifs_query_reparse_point,
.query_file_info = cifs_query_file_info,
.get_srv_inum = cifs_get_srv_inum,
.set_path_size = CIFSSMBSetEOF,
@@ -1229,6 +1139,7 @@ struct smb_version_operations smb1_operations = {
.rename = CIFSSMBRename,
.create_hardlink = CIFSCreateHardLink,
.query_symlink = cifs_query_symlink,
+ .parse_reparse_point = cifs_parse_reparse_point,
.open = cifs_open_file,
.set_fid = cifs_set_fid,
.close = cifs_close_file,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 0b89f7008ac0..c94940af5d4b 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -555,7 +555,7 @@ static int parse_create_response(struct cifs_open_info_data *data,
break;
}
data->reparse_point = reparse_point;
- data->reparse_tag = tag;
+ data->reparse.tag = tag;
return rc;
}
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 25f7cd6f23d6..82b84a4941dd 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -173,6 +173,21 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
}
mid = le64_to_cpu(shdr->MessageId);
+ if (check_smb2_hdr(shdr, mid))
+ return 1;
+
+ if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
+ cifs_dbg(VFS, "Invalid structure size %u\n",
+ le16_to_cpu(shdr->StructureSize));
+ return 1;
+ }
+
+ command = le16_to_cpu(shdr->Command);
+ if (command >= NUMBER_OF_SMB2_COMMANDS) {
+ cifs_dbg(VFS, "Invalid SMB2 command %d\n", command);
+ return 1;
+ }
+
if (len < pdu_size) {
if ((len >= hdr_size)
&& (shdr->Status != 0)) {
@@ -193,21 +208,6 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
return 1;
}
- if (check_smb2_hdr(shdr, mid))
- return 1;
-
- if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
- cifs_dbg(VFS, "Invalid structure size %u\n",
- le16_to_cpu(shdr->StructureSize));
- return 1;
- }
-
- command = le16_to_cpu(shdr->Command);
- if (command >= NUMBER_OF_SMB2_COMMANDS) {
- cifs_dbg(VFS, "Invalid SMB2 command %d\n", command);
- return 1;
- }
-
if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 ||
pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) {
@@ -313,6 +313,9 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
char *
smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
{
+ const int max_off = 4096;
+ const int max_len = 128 * 1024;
+
*off = 0;
*len = 0;
@@ -384,29 +387,20 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
* Invalid length or offset probably means data area is invalid, but
* we have little choice but to ignore the data area in this case.
*/
- if (*off > 4096) {
- cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off);
- *len = 0;
+ if (unlikely(*off < 0 || *off > max_off ||
+ *len < 0 || *len > max_len)) {
+ cifs_dbg(VFS, "%s: invalid data area (off=%d len=%d)\n",
+ __func__, *off, *len);
*off = 0;
- } else if (*off < 0) {
- cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n",
- *off);
- *off = 0;
- *len = 0;
- } else if (*len < 0) {
- cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n",
- *len);
*len = 0;
- } else if (*len > 128 * 1024) {
- cifs_dbg(VFS, "data area larger than 128K: %d\n", *len);
+ } else if (*off == 0) {
*len = 0;
}
/* return pointer to beginning of data area, ie offset from SMB start */
- if ((*off != 0) && (*len != 0))
+ if (*off > 0 && *len > 0)
return (char *)shdr + *off;
- else
- return NULL;
+ return NULL;
}
/*
@@ -787,7 +781,7 @@ __smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid,
{
struct close_cancelled_open *cancelled;
- cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC);
+ cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
if (!cancelled)
return -ENOMEM;
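The smb2_get_data_area_len() rewrite above folds four separate error branches into one range check against fixed limits and only hands back a data pointer when both offset and length are strictly positive. A compact standalone sketch of the same validation; the limits and names here are illustrative:

#include <stdio.h>

#define TOY_MAX_OFF  4096
#define TOY_MAX_LEN  (128 * 1024)

/*
 * Return a pointer to the data area inside buf, or NULL (with off/len
 * zeroed) when the advertised area is out of range.
 */
static char *toy_data_area(char *buf, int *off, int *len)
{
        if (*off < 0 || *off > TOY_MAX_OFF || *len < 0 || *len > TOY_MAX_LEN) {
                fprintf(stderr, "invalid data area (off=%d len=%d)\n",
                        *off, *len);
                *off = 0;
                *len = 0;
        } else if (*off == 0) {
                *len = 0;
        }

        if (*off > 0 && *len > 0)
                return buf + *off;
        return NULL;
}

int main(void)
{
        char buf[8192] = {0};
        int off, len;

        off = 72, len = 512;
        printf("%p\n", (void *)toy_data_area(buf, &off, &len));  /* buf + 72 */

        off = -4, len = 512;
        printf("%p\n", (void *)toy_data_area(buf, &off, &len));  /* NULL */
        return 0;
}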
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 9aeecee6b91b..14bc745de199 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -403,8 +403,10 @@ smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
shdr->Id.SyncId.ProcessId);
- cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
- server->ops->calc_smb_size(buf));
+ if (!server->ops->check_message(buf, server->total_read, server)) {
+ cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
+ server->ops->calc_smb_size(buf));
+ }
#endif
}
@@ -593,16 +595,12 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
}
/*
- * Go through iface_list and do kref_put to remove
- * any unused ifaces. ifaces in use will be removed
- * when the last user calls a kref_put on it
+ * Go through iface_list and mark them as inactive
*/
list_for_each_entry_safe(iface, niface, &ses->iface_list,
- iface_head) {
+ iface_head)
iface->is_active = 0;
- kref_put(&iface->refcount, release_iface);
- ses->iface_count--;
- }
+
spin_unlock(&ses->iface_lock);
/*
@@ -676,10 +674,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
iface_head) {
ret = iface_cmp(iface, &tmp_iface);
if (!ret) {
- /* just get a ref so that it doesn't get picked/freed */
iface->is_active = 1;
- kref_get(&iface->refcount);
- ses->iface_count++;
spin_unlock(&ses->iface_lock);
goto next_iface;
} else if (ret < 0) {
@@ -746,6 +741,20 @@ next_iface:
}
out:
+ /*
+ * Go through the list again and put the inactive entries
+ */
+ spin_lock(&ses->iface_lock);
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ if (!iface->is_active) {
+ list_del(&iface->iface_head);
+ kref_put(&iface->refcount, release_iface);
+ ses->iface_count--;
+ }
+ }
+ spin_unlock(&ses->iface_lock);
+
return rc;
}
@@ -756,6 +765,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_
unsigned int ret_data_len = 0;
struct network_interface_info_ioctl_rsp *out_buf = NULL;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *pserver;
/* do not query too frequently */
if (ses->iface_last_update &&
@@ -780,6 +790,16 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_
if (rc)
goto out;
+ /* check if iface is still active */
+ spin_lock(&ses->chan_lock);
+ pserver = ses->chans[0].server;
+ if (pserver && !cifs_chan_is_iface_active(ses, pserver)) {
+ spin_unlock(&ses->chan_lock);
+ cifs_chan_update_iface(ses, pserver);
+ spin_lock(&ses->chan_lock);
+ }
+ spin_unlock(&ses->chan_lock);
+
out:
kfree(out_buf);
return rc;
@@ -1403,12 +1423,14 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
/* Creation time should not need to be updated on close */
if (file_inf.LastWriteTime)
- inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime);
+ inode_set_mtime_to_ts(inode,
+ cifs_NTtimeToUnix(file_inf.LastWriteTime));
if (file_inf.ChangeTime)
inode_set_ctime_to_ts(inode,
cifs_NTtimeToUnix(file_inf.ChangeTime));
if (file_inf.LastAccessTime)
- inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime);
+ inode_set_atime_to_ts(inode,
+ cifs_NTtimeToUnix(file_inf.LastAccessTime));
/*
* i_blocks is not related to (i_size / i_blksize),
@@ -2828,6 +2850,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
usleep_range(512, 2048);
} while (++retry_count < 5);
+ if (!rc && !dfs_rsp)
+ rc = -EIO;
if (rc) {
if (!is_retryable_error(rc) && rc != -ENOENT && rc != -EOPNOTSUPP)
cifs_tcon_dbg(VFS, "%s: ioctl error: rc=%d\n", __func__, rc);
@@ -2858,115 +2882,119 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
return rc;
}
-static int
-parse_reparse_posix(struct reparse_posix_data *symlink_buf,
- u32 plen, char **target_path,
- struct cifs_sb_info *cifs_sb)
+/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
+static int parse_reparse_posix(struct reparse_posix_data *buf,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_open_info_data *data)
{
unsigned int len;
-
- /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
- len = le16_to_cpu(symlink_buf->ReparseDataLength);
-
- if (le64_to_cpu(symlink_buf->InodeType) != NFS_SPECFILE_LNK) {
- cifs_dbg(VFS, "%lld not a supported symlink type\n",
- le64_to_cpu(symlink_buf->InodeType));
+ u64 type;
+
+ switch ((type = le64_to_cpu(buf->InodeType))) {
+ case NFS_SPECFILE_LNK:
+ len = le16_to_cpu(buf->ReparseDataLength);
+ data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer,
+ len, true,
+ cifs_sb->local_nls);
+ if (!data->symlink_target)
+ return -ENOMEM;
+ convert_delimiter(data->symlink_target, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n",
+ __func__, data->symlink_target);
+ break;
+ case NFS_SPECFILE_CHR:
+ case NFS_SPECFILE_BLK:
+ case NFS_SPECFILE_FIFO:
+ case NFS_SPECFILE_SOCK:
+ break;
+ default:
+ cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n",
+ __func__, type);
return -EOPNOTSUPP;
}
-
- *target_path = cifs_strndup_from_utf16(
- symlink_buf->PathBuffer,
- len, true, cifs_sb->local_nls);
- if (!(*target_path))
- return -ENOMEM;
-
- convert_delimiter(*target_path, '/');
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
-
return 0;
}
-static int
-parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf,
- u32 plen, char **target_path,
- struct cifs_sb_info *cifs_sb)
+static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
+ u32 plen, bool unicode,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_open_info_data *data)
{
- unsigned int sub_len;
- unsigned int sub_offset;
+ unsigned int len;
+ unsigned int offs;
/* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
- sub_offset = le16_to_cpu(symlink_buf->SubstituteNameOffset);
- sub_len = le16_to_cpu(symlink_buf->SubstituteNameLength);
- if (sub_offset + 20 > plen ||
- sub_offset + sub_len + 20 > plen) {
+ offs = le16_to_cpu(sym->SubstituteNameOffset);
+ len = le16_to_cpu(sym->SubstituteNameLength);
+ if (offs + 20 > plen || offs + len + 20 > plen) {
cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
return -EIO;
}
- *target_path = cifs_strndup_from_utf16(
- symlink_buf->PathBuffer + sub_offset,
- sub_len, true, cifs_sb->local_nls);
- if (!(*target_path))
+ data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs,
+ len, unicode,
+ cifs_sb->local_nls);
+ if (!data->symlink_target)
return -ENOMEM;
- convert_delimiter(*target_path, '/');
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+ convert_delimiter(data->symlink_target, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target);
return 0;
}
-static int
-parse_reparse_point(struct reparse_data_buffer *buf,
- u32 plen, char **target_path,
- struct cifs_sb_info *cifs_sb)
+int parse_reparse_point(struct reparse_data_buffer *buf,
+ u32 plen, struct cifs_sb_info *cifs_sb,
+ bool unicode, struct cifs_open_info_data *data)
{
- if (plen < sizeof(struct reparse_data_buffer)) {
- cifs_dbg(VFS, "reparse buffer is too small. Must be at least 8 bytes but was %d\n",
- plen);
+ if (plen < sizeof(*buf)) {
+ cifs_dbg(VFS, "%s: reparse buffer is too small. Must be at least 8 bytes but was %d\n",
+ __func__, plen);
return -EIO;
}
- if (plen < le16_to_cpu(buf->ReparseDataLength) +
- sizeof(struct reparse_data_buffer)) {
- cifs_dbg(VFS, "srv returned invalid reparse buf length: %d\n",
- plen);
+ if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) {
+ cifs_dbg(VFS, "%s: invalid reparse buf length: %d\n",
+ __func__, plen);
return -EIO;
}
+ data->reparse.buf = buf;
+
/* See MS-FSCC 2.1.2 */
switch (le32_to_cpu(buf->ReparseTag)) {
case IO_REPARSE_TAG_NFS:
- return parse_reparse_posix(
- (struct reparse_posix_data *)buf,
- plen, target_path, cifs_sb);
+ return parse_reparse_posix((struct reparse_posix_data *)buf,
+ cifs_sb, data);
case IO_REPARSE_TAG_SYMLINK:
return parse_reparse_symlink(
(struct reparse_symlink_data_buffer *)buf,
- plen, target_path, cifs_sb);
+ plen, unicode, cifs_sb, data);
+ case IO_REPARSE_TAG_LX_SYMLINK:
+ case IO_REPARSE_TAG_AF_UNIX:
+ case IO_REPARSE_TAG_LX_FIFO:
+ case IO_REPARSE_TAG_LX_CHR:
+ case IO_REPARSE_TAG_LX_BLK:
+ return 0;
default:
- cifs_dbg(VFS, "srv returned unknown symlink buffer tag:0x%08x\n",
- le32_to_cpu(buf->ReparseTag));
+ cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
+ __func__, le32_to_cpu(buf->ReparseTag));
return -EOPNOTSUPP;
}
}
-static int smb2_query_symlink(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path,
- char **target_path,
- struct kvec *rsp_iov)
+static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
+ struct kvec *rsp_iov,
+ struct cifs_open_info_data *data)
{
struct reparse_data_buffer *buf;
struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
u32 plen = le32_to_cpu(io->OutputCount);
- cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
-
buf = (struct reparse_data_buffer *)((u8 *)io +
le32_to_cpu(io->OutputOffset));
- return parse_reparse_point(buf, plen, target_path, cifs_sb);
+ return parse_reparse_point(buf, plen, cifs_sb, true, data);
}
static int smb2_query_reparse_point(const unsigned int xid,
@@ -2989,7 +3017,7 @@ static int smb2_query_reparse_point(const unsigned int xid,
struct kvec *rsp_iov;
struct smb2_ioctl_rsp *ioctl_rsp;
struct reparse_data_buffer *reparse_buf;
- u32 plen;
+ u32 off, count, len;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
@@ -3070,16 +3098,22 @@ static int smb2_query_reparse_point(const unsigned int xid,
*/
if (rc == 0) {
/* See MS-FSCC 2.3.23 */
+ off = le32_to_cpu(ioctl_rsp->OutputOffset);
+ count = le32_to_cpu(ioctl_rsp->OutputCount);
+ if (check_add_overflow(off, count, &len) ||
+ len > rsp_iov[1].iov_len) {
+ cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n",
+ __func__, off, count);
+ rc = -EIO;
+ goto query_rp_exit;
+ }
- reparse_buf = (struct reparse_data_buffer *)
- ((char *)ioctl_rsp +
- le32_to_cpu(ioctl_rsp->OutputOffset));
- plen = le32_to_cpu(ioctl_rsp->OutputCount);
-
- if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) >
- rsp_iov[1].iov_len) {
- cifs_tcon_dbg(FYI, "srv returned invalid ioctl len: %d\n",
- plen);
+ reparse_buf = (void *)((u8 *)ioctl_rsp + off);
+ len = sizeof(*reparse_buf);
+ if (count < len ||
+ count < le16_to_cpu(reparse_buf->ReparseDataLength) + len) {
+ cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n",
+ __func__, off, count);
rc = -EIO;
goto query_rp_exit;
}
@@ -3299,6 +3333,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
struct inode *inode = file_inode(file);
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifsFileInfo *cfile = file->private_data;
+ unsigned long long new_size;
long rc;
unsigned int xid;
__le64 eof;
@@ -3329,10 +3364,15 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
/*
* do we also need to change the size of the file?
*/
- if (keep_size == false && i_size_read(inode) < offset + len) {
- eof = cpu_to_le64(offset + len);
+ new_size = offset + len;
+ if (keep_size == false && (unsigned long long)i_size_read(inode) < new_size) {
+ eof = cpu_to_le64(new_size);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
+ if (rc >= 0) {
+ truncate_setsize(inode, new_size);
+ fscache_resize_cookie(cifs_inode_cookie(inode), new_size);
+ }
}
zero_range_exit:
@@ -3727,6 +3767,9 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
if (rc < 0)
goto out_2;
+ truncate_setsize(inode, old_eof + len);
+ fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode));
+
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
if (rc < 0)
goto out_2;
@@ -4920,6 +4963,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
struct smb2_hdr *shdr;
unsigned int pdu_length = server->pdu_size;
unsigned int buf_size;
+ unsigned int next_cmd;
struct mid_q_entry *mid_entry;
int next_is_large;
char *next_buffer = NULL;
@@ -4948,14 +4992,15 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
next_is_large = server->large_buf;
one_more:
shdr = (struct smb2_hdr *)buf;
- if (shdr->NextCommand) {
+ next_cmd = le32_to_cpu(shdr->NextCommand);
+ if (next_cmd) {
+ if (WARN_ON_ONCE(next_cmd > pdu_length))
+ return -1;
if (next_is_large)
next_buffer = (char *)cifs_buf_get();
else
next_buffer = (char *)cifs_small_buf_get();
- memcpy(next_buffer,
- buf + le32_to_cpu(shdr->NextCommand),
- pdu_length - le32_to_cpu(shdr->NextCommand));
+ memcpy(next_buffer, buf + next_cmd, pdu_length - next_cmd);
}
mid_entry = smb2_find_mid(server, buf);
@@ -4979,8 +5024,8 @@ one_more:
else
ret = cifs_handle_standard(server, mid_entry);
- if (ret == 0 && shdr->NextCommand) {
- pdu_length -= le32_to_cpu(shdr->NextCommand);
+ if (ret == 0 && next_cmd) {
+ pdu_length -= next_cmd;
server->large_buf = next_is_large;
if (next_is_large)
server->bigbuf = buf = next_buffer;
@@ -5043,54 +5088,42 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
NULL, 0, false);
}
-static int
-smb2_next_header(char *buf)
+static int smb2_next_header(struct TCP_Server_Info *server, char *buf,
+ unsigned int *noff)
{
struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
- if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
- return sizeof(struct smb2_transform_hdr) +
- le32_to_cpu(t_hdr->OriginalMessageSize);
-
- return le32_to_cpu(hdr->NextCommand);
+ if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+ *noff = le32_to_cpu(t_hdr->OriginalMessageSize);
+ if (unlikely(check_add_overflow(*noff, sizeof(*t_hdr), noff)))
+ return -EINVAL;
+ } else {
+ *noff = le32_to_cpu(hdr->NextCommand);
+ }
+ if (unlikely(*noff && *noff < MID_HEADER_SIZE(server)))
+ return -EINVAL;
+ return 0;
}
-static int
-smb2_make_node(unsigned int xid, struct inode *inode,
- struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- int rc = -EPERM;
struct cifs_open_info_data buf = {};
- struct cifs_io_parms io_parms = {0};
- __u32 oplock = 0;
- struct cifs_fid fid;
+ struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms = {};
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_fid fid;
unsigned int bytes_written;
struct win_dev *pdev;
struct kvec iov[2];
+ __u32 oplock = server->oplocks ? REQ_OPLOCK : 0;
+ int rc;
- /*
- * Check if mounted with mount parm 'sfu' mount parm.
- * SFU emulation should work with all servers, but only
- * supports block and char device (no socket & fifo),
- * and was used by default in earlier versions of Windows
- */
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- return rc;
-
- /*
- * TODO: Add ability to create instead via reparse point. Windows (e.g.
- * their current NFS server) uses this approach to expose special files
- * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
- */
-
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
- return rc;
-
- cifs_dbg(FYI, "sfu compat create special file\n");
+ if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode))
+ return -EPERM;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
@@ -5103,11 +5136,7 @@ smb2_make_node(unsigned int xid, struct inode *inode,
.fid = &fid,
};
- if (tcon->ses->server->oplocks)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf);
+ rc = server->ops->open(xid, &oparms, &oplock, &buf);
if (rc)
return rc;
@@ -5115,36 +5144,56 @@ smb2_make_node(unsigned int xid, struct inode *inode,
* BB Do not bother to decode buf since no local inode yet to put
* timestamps in, but we can reuse it safely.
*/
-
pdev = (struct win_dev *)&buf.fi;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = &buf.fi;
- iov[1].iov_len = sizeof(struct win_dev);
+ io_parms.length = sizeof(*pdev);
+ iov[1].iov_base = pdev;
+ iov[1].iov_len = sizeof(*pdev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
pdev->major = cpu_to_le64(MAJOR(dev));
pdev->minor = cpu_to_le64(MINOR(dev));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
} else if (S_ISBLK(mode)) {
memcpy(pdev->type, "IntxBLK", 8);
pdev->major = cpu_to_le64(MAJOR(dev));
pdev->minor = cpu_to_le64(MINOR(dev));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
+ } else if (S_ISFIFO(mode)) {
+ memcpy(pdev->type, "LnxFIFO", 8);
}
- tcon->ses->server->ops->close(xid, tcon, &fid);
- d_drop(dentry);
+ rc = server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
/* FIXME: add code here to set EAs */
-
cifs_free_open_info(&buf);
return rc;
}
+static int smb2_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+ /*
+ * Check if mounted with the 'sfu' mount parm.
+ * SFU emulation should work with all servers, but only
+ * supports block and char device (no socket & fifo),
+ * and was used by default in earlier versions of Windows
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ return -EPERM;
+ /*
+ * TODO: Add ability to create instead via reparse point. Windows (e.g.
+ * their current NFS server) uses this approach to expose special files
+ * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
+ */
+ return cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+}
+
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
@@ -5195,7 +5244,7 @@ struct smb_version_operations smb20_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .query_symlink = smb2_query_symlink,
+ .parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
@@ -5297,7 +5346,7 @@ struct smb_version_operations smb21_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .query_symlink = smb2_query_symlink,
+ .parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
@@ -5402,7 +5451,7 @@ struct smb_version_operations smb30_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .query_symlink = smb2_query_symlink,
+ .parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
@@ -5516,7 +5565,7 @@ struct smb_version_operations smb311_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .query_symlink = smb2_query_symlink,
+ .parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
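Several hunks in this smb2ops.c diff (smb2_next_header(), receive_encrypted_standard(), smb2_query_reparse_point()) replace bare offset arithmetic with explicit overflow and range checks before an offset is used to index into the PDU. A standalone sketch of that pattern, using the compiler builtin that backs check_add_overflow(); the helper and constants below are hypothetical:

#include <stdint.h>
#include <stdio.h>

/*
 * Compute the offset of the next chained command, rejecting values that
 * overflow, that land inside the fixed header, or that point past the
 * bytes actually read.
 */
static int toy_next_offset(uint32_t declared_size, uint32_t header_size,
                           uint32_t pdu_len, uint32_t *next)
{
        uint32_t off;

        if (__builtin_add_overflow(declared_size, header_size, &off))
                return -1;              /* 32-bit wrap-around */
        if (off && off < header_size)
                return -1;              /* lands inside the header */
        if (off > pdu_len)
                return -1;              /* past what was actually read */
        *next = off;
        return 0;
}

int main(void)
{
        uint32_t next;

        printf("%d\n", toy_next_offset(200, 64, 4096, &next));         /*  0 */
        printf("%d\n", toy_next_offset(UINT32_MAX, 64, 4096, &next));  /* -1 */
        return 0;
}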
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index c75a80bb6d9e..4f971c1061f0 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -158,11 +158,14 @@ out:
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
- struct TCP_Server_Info *server)
+ struct TCP_Server_Info *server, bool from_reconnect)
{
int rc = 0;
struct nls_table *nls_codepage = NULL;
struct cifs_ses *ses;
+ int xid;
+ struct TCP_Server_Info *pserver;
+ unsigned int chan_index;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -223,6 +226,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
return -EAGAIN;
}
}
+
+ /* if server is marked for termination, cifsd will cleanup */
+ if (server->terminate) {
+ spin_unlock(&server->srv_lock);
+ return -EHOSTDOWN;
+ }
spin_unlock(&server->srv_lock);
again:
@@ -242,11 +251,23 @@ again:
mutex_lock(&ses->session_mutex);
/*
+ * if this is called by delayed work, and the channel has been disabled
+ * in parallel, the delayed work can continue to execute in parallel
+ * there's a chance that this channel may not exist anymore
+ */
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&server->srv_lock);
+ mutex_unlock(&ses->session_mutex);
+ rc = -EHOSTDOWN;
+ goto out;
+ }
+
+ /*
* Recheck after acquire mutex. If another thread is negotiating
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
- spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect) {
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
@@ -283,6 +304,53 @@ again:
rc = cifs_negotiate_protocol(0, ses, server);
if (!rc) {
+ /*
+ * if server stopped supporting multichannel
+ * and the first channel reconnected, disable all the others.
+ */
+ if (ses->chan_count > 1 &&
+ !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ if (SERVER_IS_CHAN(server)) {
+ cifs_dbg(VFS, "server %s does not support " \
+ "multichannel anymore. skipping secondary channel\n",
+ ses->server->hostname);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ goto skip_terminate;
+ }
+
+ ses->chans[chan_index].server = NULL;
+ spin_unlock(&ses->chan_lock);
+
+ /*
+ * the above reference of server by channel
+ * needs to be dropped without holding chan_lock
+ * as cifs_put_tcp_session takes a higher lock
+ * i.e. cifs_tcp_ses_lock
+ */
+ cifs_put_tcp_session(server, from_reconnect);
+
+ server->terminate = true;
+ cifs_signal_cifsd_for_reconnect(server, false);
+
+ /* mark primary server as needing reconnect */
+ pserver = server->primary_server;
+ cifs_signal_cifsd_for_reconnect(pserver, false);
+
+skip_terminate:
+ mutex_unlock(&ses->session_mutex);
+ rc = -EHOSTDOWN;
+ goto out;
+ } else {
+ cifs_server_dbg(VFS, "does not support " \
+ "multichannel anymore. disabling all other channels\n");
+ cifs_disable_secondary_channels(ses);
+ }
+ }
+
rc = cifs_setup_session(0, ses, server, nls_codepage);
if ((rc == -EACCES) && !tcon->retry) {
mutex_unlock(&ses->session_mutex);
@@ -307,15 +375,41 @@ skip_sess_setup:
tcon->need_reopen_files = true;
rc = cifs_tree_connect(0, tcon, nls_codepage);
- mutex_unlock(&ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc) {
/* If sess reconnected but tcon didn't, something strange ... */
+ mutex_unlock(&ses->session_mutex);
cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc);
goto out;
}
+ if (!rc &&
+ (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ mutex_unlock(&ses->session_mutex);
+
+ /*
+ * query server network interfaces, in case they change
+ */
+ xid = get_xid();
+ rc = SMB3_request_interfaces(xid, tcon, false);
+ free_xid(xid);
+
+ if (rc)
+ cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
+
+ if (ses->chan_max > ses->chan_count &&
+ !SERVER_IS_CHAN(server)) {
+ if (ses->chan_count == 1)
+ cifs_server_dbg(VFS, "supports multichannel now\n");
+
+ cifs_try_adding_channels(ses);
+ }
+ } else {
+ mutex_unlock(&ses->session_mutex);
+ }
+
if (smb2_command != SMB2_INTERNAL_CMD)
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -376,10 +470,15 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
void **request_buf, unsigned int *total_len)
{
/* BB eventually switch this to SMB2 specific small buf size */
- if (smb2_command == SMB2_SET_INFO)
+ switch (smb2_command) {
+ case SMB2_SET_INFO:
+ case SMB2_QUERY_INFO:
*request_buf = cifs_buf_get();
- else
+ break;
+ default:
*request_buf = cifs_small_buf_get();
+ break;
+ }
if (*request_buf == NULL) {
/* BB should we add a retry in here if not a writepage? */
return -ENOMEM;
@@ -404,7 +503,7 @@ static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
{
int rc;
- rc = smb2_reconnect(smb2_command, tcon, server);
+ rc = smb2_reconnect(smb2_command, tcon, server, false);
if (rc)
return rc;
@@ -2141,17 +2240,18 @@ parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info,
posix->nlink, posix->mode, posix->reparse_tag);
}
-void
-smb2_parse_contexts(struct TCP_Server_Info *server,
- struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key, __u8 *oplock,
- struct smb2_file_all_info *buf,
- struct create_posix_rsp *posix)
+int smb2_parse_contexts(struct TCP_Server_Info *server,
+ struct kvec *rsp_iov,
+ unsigned int *epoch,
+ char *lease_key, __u8 *oplock,
+ struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix)
{
- char *data_offset;
+ struct smb2_create_rsp *rsp = rsp_iov->iov_base;
struct create_context *cc;
- unsigned int next;
- unsigned int remaining;
+ size_t rem, off, len;
+ size_t doff, dlen;
+ size_t noff, nlen;
char *name;
static const char smb3_create_tag_posix[] = {
0x93, 0xAD, 0x25, 0x50, 0x9C,
@@ -2160,45 +2260,63 @@ smb2_parse_contexts(struct TCP_Server_Info *server,
};
*oplock = 0;
- data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset);
- remaining = le32_to_cpu(rsp->CreateContextsLength);
- cc = (struct create_context *)data_offset;
+
+ off = le32_to_cpu(rsp->CreateContextsOffset);
+ rem = le32_to_cpu(rsp->CreateContextsLength);
+ if (check_add_overflow(off, rem, &len) || len > rsp_iov->iov_len)
+ return -EINVAL;
+ cc = (struct create_context *)((u8 *)rsp + off);
/* Initialize inode number to 0 in case no valid data in qfid context */
if (buf)
buf->IndexNumber = 0;
- while (remaining >= sizeof(struct create_context)) {
- name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- if (le16_to_cpu(cc->NameLength) == 4 &&
- strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4) == 0)
- *oplock = server->ops->parse_lease_buf(cc, epoch,
- lease_key);
- else if (buf && (le16_to_cpu(cc->NameLength) == 4) &&
- strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0)
- parse_query_id_ctxt(cc, buf);
- else if ((le16_to_cpu(cc->NameLength) == 16)) {
- if (posix &&
- memcmp(name, smb3_create_tag_posix, 16) == 0)
+ while (rem >= sizeof(*cc)) {
+ doff = le16_to_cpu(cc->DataOffset);
+ dlen = le32_to_cpu(cc->DataLength);
+ if (check_add_overflow(doff, dlen, &len) || len > rem)
+ return -EINVAL;
+
+ noff = le16_to_cpu(cc->NameOffset);
+ nlen = le16_to_cpu(cc->NameLength);
+ if (noff + nlen >= doff)
+ return -EINVAL;
+
+ name = (char *)cc + noff;
+ switch (nlen) {
+ case 4:
+ if (!strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4)) {
+ *oplock = server->ops->parse_lease_buf(cc, epoch,
+ lease_key);
+ } else if (buf &&
+ !strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4)) {
+ parse_query_id_ctxt(cc, buf);
+ }
+ break;
+ case 16:
+ if (posix && !memcmp(name, smb3_create_tag_posix, 16))
parse_posix_ctxt(cc, buf, posix);
+ break;
+ default:
+ cifs_dbg(FYI, "%s: unhandled context (nlen=%zu dlen=%zu)\n",
+ __func__, nlen, dlen);
+ if (IS_ENABLED(CONFIG_CIFS_DEBUG2))
+ cifs_dump_mem("context data: ", cc, dlen);
+ break;
}
- /* else {
- cifs_dbg(FYI, "Context not matched with len %d\n",
- le16_to_cpu(cc->NameLength));
- cifs_dump_mem("Cctxt name: ", name, 4);
- } */
-
- next = le32_to_cpu(cc->Next);
- if (!next)
+
+ off = le32_to_cpu(cc->Next);
+ if (!off)
break;
- remaining -= next;
- cc = (struct create_context *)((char *)cc + next);
+ if (check_sub_overflow(rem, off, &rem))
+ return -EINVAL;
+ cc = (struct create_context *)((u8 *)cc + off);
}
if (rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
*oplock = rsp->OplockLevel;
- return;
+ return 0;
}
static int
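The smb2_parse_contexts() rewrite above validates each variable-length create context before using it: the overall offset/length pair must fit in the response iov, every context's name and data ranges must stay within the remaining bytes, and the chain is advanced via the Next field with an explicit bounds check. A self-contained sketch of walking such a chained structure; the field layout below is a toy, not the SMB2 wire format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy chained context: next == 0 terminates the chain. */
struct toy_ctx {
        uint32_t next;      /* offset from this context to the next one */
        uint16_t name_off;  /* offset of the name, relative to this context */
        uint16_t name_len;
};

static int toy_walk_contexts(const uint8_t *buf, size_t rem)
{
        const uint8_t *cc = buf;

        while (rem >= sizeof(struct toy_ctx)) {
                struct toy_ctx ctx;
                size_t end;

                memcpy(&ctx, cc, sizeof(ctx));  /* avoid unaligned access */

                /* name must lie entirely inside the remaining bytes */
                if (__builtin_add_overflow((size_t)ctx.name_off,
                                           (size_t)ctx.name_len, &end) ||
                    end > rem)
                        return -1;
                printf("context with %u-byte name at +%u\n",
                       ctx.name_len, ctx.name_off);

                if (!ctx.next)
                        break;
                if (ctx.next > rem)
                        return -1;      /* would walk past the buffer */
                rem -= ctx.next;
                cc += ctx.next;
        }
        return 0;
}

int main(void)
{
        uint8_t buf[64] = {0};
        struct toy_ctx first = { .next = 16, .name_off = 8, .name_len = 4 };
        struct toy_ctx second = { .next = 0, .name_off = 8, .name_len = 4 };

        memcpy(buf, &first, sizeof(first));
        memcpy(buf + 16, &second, sizeof(second));
        printf("%d\n", toy_walk_contexts(buf, sizeof(buf)));  /* 0 */
        return 0;
}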
@@ -3029,8 +3147,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
- smb2_parse_contexts(server, rsp, &oparms->fid->epoch,
- oparms->fid->lease_key, oplock, buf, posix);
+ rc = smb2_parse_contexts(server, &rsp_iov, &oparms->fid->epoch,
+ oparms->fid->lease_key, oplock, buf, posix);
creat_exit:
SMB2_open_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
@@ -3377,12 +3495,10 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
} else {
trace_smb3_close_done(xid, persistent_fid, tcon->tid,
ses->Suid);
- /*
- * Note that have to subtract 4 since struct network_open_info
- * has a final 4 byte pad that close response does not have
- */
if (pbuf)
- memcpy(pbuf, (char *)&rsp->CreationTime, sizeof(*pbuf) - 4);
+ memcpy(&pbuf->network_open_info,
+ &rsp->network_open_info,
+ sizeof(pbuf->network_open_info));
}
atomic_dec(&tcon->num_remote_opens);
@@ -3475,8 +3591,13 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
struct smb2_query_info_req *req;
struct kvec *iov = rqst->rq_iov;
unsigned int total_len;
+ size_t len;
int rc;
+ if (unlikely(check_add_overflow(input_len, sizeof(*req), &len) ||
+ len > CIFSMaxBufSize))
+ return -EINVAL;
+
rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, server,
(void **) &req, &total_len);
if (rc)
@@ -3498,7 +3619,7 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
- iov[0].iov_len = total_len - 1 + input_len;
+ iov[0].iov_len = len;
return 0;
}
@@ -3506,7 +3627,7 @@ void
SMB2_query_info_free(struct smb_rqst *rqst)
{
if (rqst && rqst->rq_iov)
- cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+ cifs_buf_release(rqst->rq_iov[0].iov_base); /* request */
}
static int
@@ -3802,12 +3923,28 @@ void smb2_reconnect_server(struct work_struct *work)
int rc;
bool resched = false;
+ /* first check if the ref count has reached 0; if not, increment it */
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!server->srv_count) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return;
+ }
+ server->srv_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+
/* If server is a channel, select the primary channel */
pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
/* Prevent simultaneous reconnects that can corrupt tcon->rlist list */
mutex_lock(&pserver->reconnect_mutex);
+ /* if the server is marked for termination, drop the ref count here */
+ if (server->terminate) {
+ cifs_put_tcp_session(server, true);
+ mutex_unlock(&pserver->reconnect_mutex);
+ return;
+ }
+
INIT_LIST_HEAD(&tmp_list);
INIT_LIST_HEAD(&tmp_ses_list);
cifs_dbg(FYI, "Reconnecting tcons and channels\n");
@@ -3852,17 +3989,10 @@ void smb2_reconnect_server(struct work_struct *work)
}
spin_unlock(&ses->chan_lock);
}
- /*
- * Get the reference to server struct to be sure that the last call of
- * cifs_put_tcon() in the loop below won't release the server pointer.
- */
- if (tcon_exist || ses_exist)
- server->srv_count++;
-
spin_unlock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
- rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server);
+ rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true);
if (!rc)
cifs_reopen_persistent_handles(tcon);
else
@@ -3895,7 +4025,7 @@ void smb2_reconnect_server(struct work_struct *work)
/* now reconnect sessions for necessary channels */
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
tcon->ses = ses;
- rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server);
+ rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true);
if (rc)
resched = true;
list_del_init(&ses->rlist);
@@ -3910,8 +4040,7 @@ done:
mutex_unlock(&pserver->reconnect_mutex);
/* now we can safely release srv struct */
- if (tcon_exist || ses_exist)
- cifs_put_tcp_session(server, 1);
+ cifs_put_tcp_session(server, true);
}
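
The reconnect worker now pins the server up front: it takes a reference under cifs_tcp_ses_lock only if srv_count is still non-zero, re-checks the terminate flag once the reconnect mutex is held, and drops the reference on every exit path, instead of bumping the count later only when tcons or sessions were queued. A hedged sketch of that "get only if still live" pattern is below, using plain pthread locking and a hypothetical struct conn rather than the real TCP_Server_Info/cifs_put_tcp_session machinery.

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical refcounted object, for illustration only. */
struct conn {
	pthread_mutex_t lock;
	unsigned int refcount;	/* 0 means the object is being torn down */
	bool terminate;
};

/* Take a reference only if the object is still live. */
bool conn_get_if_live(struct conn *c)
{
	bool ok;

	pthread_mutex_lock(&c->lock);
	ok = c->refcount != 0;
	if (ok)
		c->refcount++;
	pthread_mutex_unlock(&c->lock);
	return ok;
}

void conn_put(struct conn *c)
{
	pthread_mutex_lock(&c->lock);
	c->refcount--;	/* caller frees the object when this reaches 0 */
	pthread_mutex_unlock(&c->lock);
}

/* Worker shape mirroring smb2_reconnect_server(): grab the reference
 * first, re-check for termination once the work lock is held, and drop
 * the reference on every exit path. */
void reconnect_worker(struct conn *c, pthread_mutex_t *work_lock)
{
	if (!conn_get_if_live(c))
		return;

	pthread_mutex_lock(work_lock);
	if (c->terminate) {
		conn_put(c);
		pthread_mutex_unlock(work_lock);
		return;
	}

	/* ... do the reconnect work ... */

	pthread_mutex_unlock(work_lock);
	conn_put(c);
}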
int
@@ -5373,6 +5502,11 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon,
return 0;
}
+static inline void free_qfs_info_req(struct kvec *iov)
+{
+ cifs_buf_release(iov->iov_base);
+}
+
int
SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata)
@@ -5404,7 +5538,7 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(iov.iov_base);
+ free_qfs_info_req(&iov);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
goto posix_qfsinf_exit;
@@ -5455,7 +5589,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(iov.iov_base);
+ free_qfs_info_req(&iov);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
goto qfsinf_exit;
@@ -5522,7 +5656,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(iov.iov_base);
+ free_qfs_info_req(&iov);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
goto qfsattr_exit;
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index 220994d0a0f7..db08194484e0 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -319,13 +319,15 @@ struct smb2_file_reparse_point_info {
} __packed;
struct smb2_file_network_open_info {
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize;
- __le64 EndOfFile;
- __le32 Attributes;
+ struct_group(network_open_info,
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize;
+ __le64 EndOfFile;
+ __le32 Attributes;
+ );
__le32 Reserved;
} __packed; /* level 34 Query also similar returned in close rsp and open rsp */
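
The struct_group() markers added to smb2_file_network_open_info (and to smb2_close_rsp later in this diff) let the close path copy the whole CreationTime..Attributes run with a single bounded memcpy(), as seen in the __SMB2_close() hunk earlier, instead of the old sizeof(*pbuf) - 4 adjustment. struct_group() lives in include/linux/stddef.h and wraps the members in an anonymous union so they are addressable both individually and as one named mirror struct. The sketch below reproduces the idea with a simplified macro and hypothetical structs; it is illustrative, not the kernel definition.

#include <stdint.h>
#include <string.h>

/* Simplified stand-in for the kernel's struct_group() (linux/stddef.h):
 * the members exist both individually and as one named sub-struct. */
#define struct_group(NAME, ...)			\
	union {					\
		struct { __VA_ARGS__ };		\
		struct { __VA_ARGS__ } NAME;	\
	}

/* Hypothetical response/info structs sharing the same grouped fields. */
struct close_rsp {
	uint32_t reserved;
	struct_group(times,
		uint64_t creation;
		uint64_t last_access;
		uint64_t last_write;
	);
} __attribute__((packed));

struct open_info {
	struct_group(times,
		uint64_t creation;
		uint64_t last_access;
		uint64_t last_write;
	);
	uint32_t attributes;
} __attribute__((packed));

void copy_times(struct open_info *dst, const struct close_rsp *src)
{
	/* one bounded copy of the whole group, no manual size adjustment */
	memcpy(&dst->times, &src->times, sizeof(dst->times));
}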
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 46eff9ec302a..0e371f7e2854 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -251,11 +251,13 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
-extern void smb2_parse_contexts(struct TCP_Server_Info *server,
- struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key,
- __u8 *oplock, struct smb2_file_all_info *buf,
- struct create_posix_rsp *posix);
+int smb2_parse_contexts(struct TCP_Server_Info *server,
+ struct kvec *rsp_iov,
+ unsigned int *epoch,
+ char *lease_key, __u8 *oplock,
+ struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix);
+
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 23c50ed7d4b5..5a3ca62d2f07 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -413,7 +413,13 @@ generate_smb3signingkey(struct cifs_ses *ses,
ses->ses_status == SES_GOOD);
chan_index = cifs_ses_get_chan_index(ses, server);
- /* TODO: introduce ref counting for channels when the can be freed */
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ spin_unlock(&ses->ses_lock);
+
+ return -EINVAL;
+ }
+
spin_unlock(&ses->chan_lock);
spin_unlock(&ses->ses_lock);
@@ -452,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses,
ptriplet->encryption.context,
ses->smb3encryptionkey,
SMB3_ENC_DEC_KEY_SIZE);
+ if (rc)
+ return rc;
rc = generate_key(ses, ptriplet->decryption.label,
ptriplet->decryption.context,
ses->smb3decryptionkey,
@@ -460,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses,
return rc;
}
- if (rc)
- return rc;
-
#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__);
/*
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 14710afdc2a3..4f717ad7c21b 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -76,7 +76,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
return temp;
}
-static void __release_mid(struct kref *refcount)
+void __release_mid(struct kref *refcount)
{
struct mid_q_entry *midEntry =
container_of(refcount, struct mid_q_entry, refcount);
@@ -156,15 +156,6 @@ static void __release_mid(struct kref *refcount)
mempool_free(midEntry, cifs_mid_poolp);
}
-void release_mid(struct mid_q_entry *mid)
-{
- struct TCP_Server_Info *server = mid->server;
-
- spin_lock(&server->mid_lock);
- kref_put(&mid->refcount, __release_mid);
- spin_unlock(&server->mid_lock);
-}
-
void
delete_mid(struct mid_q_entry *mid)
{
@@ -1032,7 +1023,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
server = ses->chans[i].server;
- if (!server)
+ if (!server || server->terminate)
continue;
/*
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 4ad5531686d8..6780aa3e98a1 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -150,10 +150,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto out;
- if (pTcon->ses->server->ops->set_EA)
+ if (pTcon->ses->server->ops->set_EA) {
rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
full_path, name, value, (__u16)size,
cifs_sb->local_nls, cifs_sb);
+ if (rc == 0)
+ inode_set_ctime_current(inode);
+ }
break;
case XATTR_CIFS_ACL:
@@ -478,7 +481,7 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = {
.set = cifs_xattr_set,
};
-const struct xattr_handler *cifs_xattr_handlers[] = {
+const struct xattr_handler * const cifs_xattr_handlers[] = {
&cifs_user_xattr_handler,
&cifs_os2_xattr_handler,
&cifs_cifs_acl_xattr_handler,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 319fb9ffc6a0..57f2343164a3 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -34,6 +34,7 @@
#define SMB2_QUERY_INFO_HE 0x0010
#define SMB2_SET_INFO_HE 0x0011
#define SMB2_OPLOCK_BREAK_HE 0x0012
+#define SMB2_SERVER_TO_CLIENT_NOTIFICATION 0x0013
/* The same list in little endian */
#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
@@ -411,6 +412,7 @@ struct smb2_tree_disconnect_rsp {
#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_NOTIFICATIONS 0x00000080 /* New to SMB3.1.1 */
/* Internal types */
#define SMB2_NT_FIND 0x00100000
#define SMB2_LARGE_FILES 0x00200000
@@ -700,13 +702,16 @@ struct smb2_close_rsp {
__le16 StructureSize; /* 60 */
__le16 Flags;
__le32 Reserved;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile;
- __le32 Attributes;
+ struct_group(network_open_info,
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 AllocationSize;
+ __le64 EndOfFile;
+ __le32 Attributes;
+ );
} __packed;
@@ -981,6 +986,19 @@ struct smb2_change_notify_rsp {
__u8 Buffer[]; /* array of file notify structs */
} __packed;
+/*
+ * SMB2_SERVER_TO_CLIENT_NOTIFICATION: See MS-SMB2 section 2.2.44
+ */
+
+#define SMB2_NOTIFY_SESSION_CLOSED 0x0000
+
+struct smb2_server_client_notification {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __u16 Reserved; /* MBZ */
+ __le32 NotificationType;
+ __u8 NotificationBuffer[4]; /* MBZ */
+} __packed;
/*
* SMB2_CREATE See MS-SMB2 section 2.2.13
@@ -1097,16 +1115,23 @@ struct smb2_change_notify_rsp {
#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
+/* FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) should be zero, ignored */
+/* FILE_SYNCHRONOUS_IO_NONALERT cpu_to_le32(0x00000020) should be zero, ignored */
#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
+/* FILE_OPEN_REMOTE_INSTANCE cpu_to_le32(0x00000400) should be zero, ignored */
#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
-#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
+#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) /* MBZ */
#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
+/* FILE_OPEN_REQUIRING_OPLOCK cpu_to_le32(0x00010000) should be zero, ignored */
+/* FILE_DISALLOW_EXCLUSIVE cpu_to_le32(0x00020000) should be zero, ignored */
+/* FILE_RESERVE_OPFILTER cpu_to_le32(0x00100000) MBZ */
#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
+/* #define FILE_OPEN_FOR_FREE_SPACE_QUERY cpu_to_le32(0x00800000) should be zero, ignored */
#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF)
#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
@@ -1120,7 +1145,7 @@ struct smb2_change_notify_rsp {
#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
+#define SMB2_CREATE_ALLOCATION_SIZE "AlSi"
#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
@@ -1228,6 +1253,7 @@ struct create_mxac_rsp {
#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04)
#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02)
+#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE cpu_to_le32(0x04)
#define SMB2_LEASE_KEY_SIZE 16
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 4b38c3a285f6..b6fa1e285c40 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -167,23 +167,7 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status)
void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id)
{
- struct ksmbd_conn *bind_conn;
-
wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2);
-
- down_read(&conn_list_lock);
- list_for_each_entry(bind_conn, &conn_list, conns_list) {
- if (bind_conn == conn)
- continue;
-
- if ((bind_conn->binding || xa_load(&bind_conn->sessions, sess_id)) &&
- !ksmbd_conn_releasing(bind_conn) &&
- atomic_read(&bind_conn->req_running)) {
- wait_event(bind_conn->req_running_q,
- atomic_read(&bind_conn->req_running) == 0);
- }
- }
- up_read(&conn_list_lock);
}
int ksmbd_conn_write(struct ksmbd_work *work)
diff --git a/fs/smb/server/ksmbd_spnego_negtokeninit.asn1 b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
index 0065f191b54b..001513806fc0 100644
--- a/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
+++ b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
@@ -1,3 +1,11 @@
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Copyright (C) 1998, 2000 IETF Trust and the persons identified as authors
+-- of the code
+--
+-- https://www.rfc-editor.org/rfc/rfc2478#section-3.2.1
+-- https://www.rfc-editor.org/rfc/rfc2743#section-3.1
+
GSSAPI ::=
[APPLICATION 0] IMPLICIT SEQUENCE {
thisMech
diff --git a/fs/smb/server/ksmbd_spnego_negtokentarg.asn1 b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
index 1151933e7b9c..797e485d57f1 100644
--- a/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
+++ b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
@@ -1,3 +1,10 @@
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Copyright (C) 1998 IETF Trust and the persons identified as authors
+-- of the code
+--
+-- https://www.rfc-editor.org/rfc/rfc2478#section-3.2.1
+
GSSAPI ::=
CHOICE {
negTokenInit
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c
index 51def3ca74c0..d7c676c151e2 100644
--- a/fs/smb/server/ksmbd_work.c
+++ b/fs/smb/server/ksmbd_work.c
@@ -56,6 +56,9 @@ void ksmbd_free_work_struct(struct ksmbd_work *work)
kfree(work->tr_buf);
kvfree(work->request_buf);
kfree(work->iov);
+ if (!list_empty(&work->interim_entry))
+ list_del(&work->interim_entry);
+
if (work->async_id)
ksmbd_release_id(&work->conn->async_ida, work->async_id);
kmem_cache_free(work_cache, work);
@@ -95,32 +98,42 @@ bool ksmbd_queue_work(struct ksmbd_work *work)
return queue_work(ksmbd_wq, &work->work);
}
-static int ksmbd_realloc_iov_pin(struct ksmbd_work *work, void *ib,
- unsigned int ib_len)
+static inline void __ksmbd_iov_pin(struct ksmbd_work *work, void *ib,
+ unsigned int ib_len)
+{
+ work->iov[++work->iov_idx].iov_base = ib;
+ work->iov[work->iov_idx].iov_len = ib_len;
+ work->iov_cnt++;
+}
+
+static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len,
+ void *aux_buf, unsigned int aux_size)
{
+ struct aux_read *ar = NULL;
+ int need_iov_cnt = 1;
+
+ if (aux_size) {
+ need_iov_cnt++;
+ ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL);
+ if (!ar)
+ return -ENOMEM;
+ }
- if (work->iov_alloc_cnt <= work->iov_cnt) {
+ if (work->iov_alloc_cnt < work->iov_cnt + need_iov_cnt) {
struct kvec *new;
work->iov_alloc_cnt += 4;
new = krealloc(work->iov,
sizeof(struct kvec) * work->iov_alloc_cnt,
GFP_KERNEL | __GFP_ZERO);
- if (!new)
+ if (!new) {
+ kfree(ar);
+ work->iov_alloc_cnt -= 4;
return -ENOMEM;
+ }
work->iov = new;
}
- work->iov[++work->iov_idx].iov_base = ib;
- work->iov[work->iov_idx].iov_len = ib_len;
- work->iov_cnt++;
-
- return 0;
-}
-
-static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len,
- void *aux_buf, unsigned int aux_size)
-{
/* Plus rfc_length size on first iov */
if (!work->iov_idx) {
work->iov[work->iov_idx].iov_base = work->response_buf;
@@ -129,19 +142,13 @@ static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len,
work->iov_cnt++;
}
- ksmbd_realloc_iov_pin(work, ib, len);
+ __ksmbd_iov_pin(work, ib, len);
inc_rfc1001_len(work->iov[0].iov_base, len);
if (aux_size) {
- struct aux_read *ar;
-
- ksmbd_realloc_iov_pin(work, aux_buf, aux_size);
+ __ksmbd_iov_pin(work, aux_buf, aux_size);
inc_rfc1001_len(work->iov[0].iov_base, aux_size);
- ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL);
- if (!ar)
- return -ENOMEM;
-
ar->buf = aux_buf;
list_add(&ar->entry, &work->aux_read_list);
}
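
The reworked __ksmbd_iov_pin_rsp() above performs every allocation that can fail (the aux_read node and any kvec array growth) before the iovs are pinned and the RFC1001 length is increased, so an -ENOMEM can no longer leave a half-built response behind. The sketch below shows the same "reserve first, commit after" shape with standard C allocation and hypothetical names; it is not the ksmbd code itself.

#include <stdlib.h>
#include <string.h>

struct iovec_buf {
	void *base;
	size_t len;
};

struct response {
	struct iovec_buf *iov;
	size_t iov_cnt;
	size_t iov_alloc;
	size_t total_len;	/* analogous to the RFC1001 length field */
};

/* Append one mandatory buffer and an optional auxiliary buffer.
 * All failures happen before any state is modified. */
int response_pin(struct response *r, void *buf, size_t len,
		 void *aux, size_t aux_len)
{
	size_t need = r->iov_cnt + 1 + (aux_len ? 1 : 0);

	/* reserve: grow the array first, while we can still fail cleanly */
	if (need > r->iov_alloc) {
		size_t new_alloc = r->iov_alloc + 4;
		struct iovec_buf *tmp =
			realloc(r->iov, new_alloc * sizeof(*tmp));
		if (!tmp)
			return -1;
		r->iov = tmp;
		r->iov_alloc = new_alloc;
	}

	/* commit: from here on nothing can fail */
	r->iov[r->iov_cnt++] = (struct iovec_buf){ buf, len };
	r->total_len += len;
	if (aux_len) {
		r->iov[r->iov_cnt++] = (struct iovec_buf){ aux, aux_len };
		r->total_len += aux_len;
	}
	return 0;
}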
diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h
index 6a44109617f1..e068a19fd904 100644
--- a/fs/smb/server/mgmt/user_config.h
+++ b/fs/smb/server/mgmt/user_config.h
@@ -18,7 +18,6 @@ struct ksmbd_user {
size_t passkey_sz;
char *passkey;
- unsigned int failed_login_count;
};
static inline bool user_guest(struct ksmbd_user *user)
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 9bc0103720f5..562b180459a1 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -102,9 +102,10 @@ static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx)
lease->new_state = 0;
lease->flags = lctx->flags;
lease->duration = lctx->duration;
+ lease->is_dir = lctx->is_dir;
memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE);
lease->version = lctx->version;
- lease->epoch = 0;
+ lease->epoch = le16_to_cpu(lctx->epoch);
INIT_LIST_HEAD(&opinfo->lease_entry);
opinfo->o_lease = lease;
@@ -395,8 +396,8 @@ void close_id_del_oplock(struct ksmbd_file *fp)
{
struct oplock_info *opinfo;
- if (S_ISDIR(file_inode(fp->filp)->i_mode))
- return;
+ if (fp->reserve_lease_break)
+ smb_lazy_parent_lease_break_close(fp);
opinfo = opinfo_get(fp);
if (!opinfo)
@@ -543,12 +544,13 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
/* upgrading lease */
if ((atomic_read(&ci->op_count) +
atomic_read(&ci->sop_count)) == 1) {
- if (lease->state ==
- (lctx->req_state & lease->state)) {
+ if (lease->state != SMB2_LEASE_NONE_LE &&
+ lease->state == (lctx->req_state & lease->state)) {
lease->state |= lctx->req_state;
if (lctx->req_state &
SMB2_LEASE_WRITE_CACHING_LE)
lease_read_to_write(opinfo);
+
}
} else if ((atomic_read(&ci->op_count) +
atomic_read(&ci->sop_count)) > 1) {
@@ -833,7 +835,8 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo)
interim_entry);
setup_async_work(in_work, NULL, NULL);
smb2_send_interim_resp(in_work, STATUS_PENDING);
- list_del(&in_work->interim_entry);
+ list_del_init(&in_work->interim_entry);
+ release_async_work(in_work);
}
INIT_WORK(&work->work, __smb2_lease_break_noti);
ksmbd_queue_work(work);
@@ -899,7 +902,8 @@ static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level)
lease->new_state =
SMB2_LEASE_READ_CACHING_LE;
} else {
- if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+ if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE &&
+ !lease->is_dir)
lease->new_state =
SMB2_LEASE_READ_CACHING_LE;
else
@@ -1031,6 +1035,7 @@ static void copy_lease(struct oplock_info *op1, struct oplock_info *op2)
SMB2_LEASE_KEY_SIZE);
lease2->duration = lease1->duration;
lease2->flags = lease1->flags;
+ lease2->epoch = lease1->epoch++;
}
static int add_lease_global_list(struct oplock_info *opinfo)
@@ -1080,6 +1085,89 @@ static void set_oplock_level(struct oplock_info *opinfo, int level,
}
}
+void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx)
+{
+ struct oplock_info *opinfo;
+ struct ksmbd_inode *p_ci = NULL;
+
+ if (lctx->version != 2)
+ return;
+
+ p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent);
+ if (!p_ci)
+ return;
+
+ read_lock(&p_ci->m_lock);
+ list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
+ if (!opinfo->is_lease)
+ continue;
+
+ if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE &&
+ (!(lctx->flags & SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE) ||
+ !compare_guid_key(opinfo, fp->conn->ClientGUID,
+ lctx->parent_lease_key))) {
+ if (!atomic_inc_not_zero(&opinfo->refcount))
+ continue;
+
+ atomic_inc(&opinfo->conn->r_count);
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ atomic_dec(&opinfo->conn->r_count);
+ continue;
+ }
+
+ read_unlock(&p_ci->m_lock);
+ oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
+ opinfo_conn_put(opinfo);
+ read_lock(&p_ci->m_lock);
+ }
+ }
+ read_unlock(&p_ci->m_lock);
+
+ ksmbd_inode_put(p_ci);
+}
+
+void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
+{
+ struct oplock_info *opinfo;
+ struct ksmbd_inode *p_ci = NULL;
+
+ rcu_read_lock();
+ opinfo = rcu_dereference(fp->f_opinfo);
+ rcu_read_unlock();
+
+ if (!opinfo->is_lease || opinfo->o_lease->version != 2)
+ return;
+
+ p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent);
+ if (!p_ci)
+ return;
+
+ read_lock(&p_ci->m_lock);
+ list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
+ if (!opinfo->is_lease)
+ continue;
+
+ if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE) {
+ if (!atomic_inc_not_zero(&opinfo->refcount))
+ continue;
+
+ atomic_inc(&opinfo->conn->r_count);
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ atomic_dec(&opinfo->conn->r_count);
+ continue;
+ }
+ read_unlock(&p_ci->m_lock);
+ oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
+ opinfo_conn_put(opinfo);
+ read_lock(&p_ci->m_lock);
+ }
+ }
+ read_unlock(&p_ci->m_lock);
+
+ ksmbd_inode_put(p_ci);
+}
+
/**
* smb_grant_oplock() - handle oplock/lease request on file open
* @work: smb work
@@ -1103,10 +1191,6 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
bool prev_op_has_lease;
__le32 prev_op_state = 0;
- /* not support directory lease */
- if (S_ISDIR(file_inode(fp->filp)->i_mode))
- return 0;
-
opinfo = alloc_opinfo(work, pid, tid);
if (!opinfo)
return -ENOMEM;
@@ -1363,6 +1447,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
memcpy(buf->lcontext.LeaseKey, lease->lease_key,
SMB2_LEASE_KEY_SIZE);
buf->lcontext.LeaseFlags = lease->flags;
+ buf->lcontext.Epoch = cpu_to_le16(++lease->epoch);
buf->lcontext.LeaseState = lease->state;
memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key,
SMB2_LEASE_KEY_SIZE);
@@ -1399,10 +1484,11 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
/**
* parse_lease_state() - parse lease context contained in file open request
* @open_req: buffer containing smb2 file open(create) request
+ * @is_dir: whether the file being leased is a directory
*
* Return: oplock state, -ENOENT if create lease context not found
*/
-struct lease_ctx_info *parse_lease_state(void *open_req)
+struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
{
struct create_context *cc;
struct smb2_create_req *req = (struct smb2_create_req *)open_req;
@@ -1420,8 +1506,14 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
- lreq->req_state = lc->lcontext.LeaseState;
+ if (is_dir) {
+ lreq->req_state = lc->lcontext.LeaseState &
+ ~SMB2_LEASE_WRITE_CACHING_LE;
+ lreq->is_dir = true;
+ } else
+ lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
+ lreq->epoch = lc->lcontext.Epoch;
lreq->duration = lc->lcontext.LeaseDuration;
memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey,
SMB2_LEASE_KEY_SIZE);
diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h
index 4b0fe6da7694..5b93ea9196c0 100644
--- a/fs/smb/server/oplock.h
+++ b/fs/smb/server/oplock.h
@@ -34,7 +34,9 @@ struct lease_ctx_info {
__le32 flags;
__le64 duration;
__u8 parent_lease_key[SMB2_LEASE_KEY_SIZE];
+ __le16 epoch;
int version;
+ bool is_dir;
};
struct lease_table {
@@ -53,6 +55,7 @@ struct lease {
__u8 parent_lease_key[SMB2_LEASE_KEY_SIZE];
int version;
unsigned short epoch;
+ bool is_dir;
struct lease_table *l_lb;
};
@@ -108,7 +111,7 @@ void opinfo_put(struct oplock_info *opinfo);
/* Lease related functions */
void create_lease_buf(u8 *rbuf, struct lease *lease);
-struct lease_ctx_info *parse_lease_state(void *open_req);
+struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir);
__u8 smb2_map_lease_to_oplock(__le32 lease_state);
int lease_read_to_write(struct oplock_info *opinfo);
@@ -124,4 +127,7 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn,
int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
struct lease_ctx_info *lctx);
void destroy_lease_table(struct ksmbd_conn *conn);
+void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx);
+void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp);
#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 23bd3d1209df..03dded29a980 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -106,16 +106,25 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
break;
case SMB2_CREATE:
{
+ unsigned short int name_off =
+ le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset);
+ unsigned short int name_len =
+ le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength);
+
if (((struct smb2_create_req *)hdr)->CreateContextsLength) {
*off = le32_to_cpu(((struct smb2_create_req *)
hdr)->CreateContextsOffset);
*len = le32_to_cpu(((struct smb2_create_req *)
hdr)->CreateContextsLength);
- break;
+ if (!name_len)
+ break;
+
+ if (name_off + name_len < (u64)*off + *len)
+ break;
}
- *off = le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset);
- *len = le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength);
+ *off = name_off;
+ *len = name_len;
break;
}
case SMB2_QUERY_INFO:
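
In the SMB2_CREATE branch above, the data area is taken from the CreateContexts offset/length only when the name region ends before that area does; otherwise the code falls back to the name region. The comparison is done after widening to 64 bits so the untrusted 16/32-bit request fields cannot wrap. A standalone sketch of that decision is below; the struct is a simplified stand-in for the create request header, not the wire format.

#include <stdint.h>

/* Simplified view of the relevant create request fields. */
struct create_req_hdr {
	uint16_t name_off;
	uint16_t name_len;
	uint32_t ctx_off;
	uint32_t ctx_len;
};

/* Pick the region that describes the variable data area: prefer the
 * create contexts, but only when the name region (if any) ends before
 * the context area does. Widen to 64 bits so additions cannot wrap. */
void pick_data_area(const struct create_req_hdr *req,
		    uint64_t *off, uint64_t *len)
{
	if (req->ctx_len) {
		*off = req->ctx_off;
		*len = req->ctx_len;
		if (!req->name_len)
			return;
		if ((uint64_t)req->name_off + req->name_len <
		    (uint64_t)req->ctx_off + req->ctx_len)
			return;
	}
	*off = req->name_off;
	*len = req->name_len;
}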
diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c
index aed7704a0672..27a9dce3e03a 100644
--- a/fs/smb/server/smb2ops.c
+++ b/fs/smb/server/smb2ops.c
@@ -221,7 +221,8 @@ void init_smb3_0_server(struct ksmbd_conn *conn)
conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
- conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING |
+ SMB2_GLOBAL_CAP_DIRECTORY_LEASING;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION &&
conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
@@ -245,7 +246,8 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
- conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING |
+ SMB2_GLOBAL_CAP_DIRECTORY_LEASING;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
(!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
@@ -270,7 +272,8 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
- conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING |
+ SMB2_GLOBAL_CAP_DIRECTORY_LEASING;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
(!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 93262ca3f58a..652ab429bf2e 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -657,13 +657,9 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls)
int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
{
- struct smb2_hdr *rsp_hdr;
struct ksmbd_conn *conn = work->conn;
int id;
- rsp_hdr = ksmbd_resp_buf_next(work);
- rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND;
-
id = ksmbd_acquire_async_msg_id(&conn->async_ida);
if (id < 0) {
pr_err("Failed to alloc async message id\n");
@@ -671,7 +667,6 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
}
work->asynchronous = true;
work->async_id = id;
- rsp_hdr->Id.AsyncId = cpu_to_le64(id);
ksmbd_debug(SMB,
"Send interim Response to inform async request id : %d\n",
@@ -723,6 +718,8 @@ void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status)
__SMB2_HEADER_STRUCTURE_SIZE);
rsp_hdr = smb2_get_msg(in_work->response_buf);
+ rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND;
+ rsp_hdr->Id.AsyncId = cpu_to_le64(work->async_id);
smb2_set_err_rsp(in_work);
rsp_hdr->Status = status;
@@ -2380,7 +2377,8 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
rc = 0;
} else {
rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value,
- le16_to_cpu(eabuf->EaValueLength), 0);
+ le16_to_cpu(eabuf->EaValueLength),
+ 0, true);
if (rc < 0) {
ksmbd_debug(SMB,
"ksmbd_vfs_setxattr is failed(%d)\n",
@@ -2443,7 +2441,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path,
return -EBADF;
}
- rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0);
+ rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0, false);
if (rc < 0)
pr_err("Failed to store XATTR stream name :%d\n", rc);
return 0;
@@ -2518,7 +2516,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *
da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME |
XATTR_DOSINFO_ITIME;
- rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da);
+ rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da, true);
if (rc)
ksmbd_debug(SMB, "failed to store file attribute into xattr\n");
}
@@ -2608,7 +2606,7 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work,
sizeof(struct create_sd_buf_req))
return -EINVAL;
return set_info_sec(work->conn, work->tcon, path, &sd_buf->ntsd,
- le32_to_cpu(sd_buf->ccontext.DataLength), true);
+ le32_to_cpu(sd_buf->ccontext.DataLength), true, false);
}
static void ksmbd_acls_fattr(struct smb_fattr *fattr,
@@ -2690,7 +2688,7 @@ int smb2_open(struct ksmbd_work *work)
*(char *)req->Buffer == '\\') {
pr_err("not allow directory name included leading slash\n");
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
name = smb2_get_name(req->Buffer,
@@ -2701,7 +2699,7 @@ int smb2_open(struct ksmbd_work *work)
if (rc != -ENOMEM)
rc = -ENOENT;
name = NULL;
- goto err_out1;
+ goto err_out2;
}
ksmbd_debug(SMB, "converted name = %s\n", name);
@@ -2709,48 +2707,44 @@ int smb2_open(struct ksmbd_work *work)
if (!test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_STREAMS)) {
rc = -EBADF;
- goto err_out1;
+ goto err_out2;
}
rc = parse_stream_name(name, &stream_name, &s_type);
if (rc < 0)
- goto err_out1;
+ goto err_out2;
}
rc = ksmbd_validate_filename(name);
if (rc < 0)
- goto err_out1;
+ goto err_out2;
if (ksmbd_share_veto_filename(share, name)) {
rc = -ENOENT;
ksmbd_debug(SMB, "Reject open(), vetoed file: %s\n",
name);
- goto err_out1;
+ goto err_out2;
}
} else {
name = kstrdup("", GFP_KERNEL);
if (!name) {
rc = -ENOMEM;
- goto err_out1;
+ goto err_out2;
}
}
- req_op_level = req->RequestedOplockLevel;
- if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
- lc = parse_lease_state(req);
-
if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE)) {
pr_err("Invalid impersonationlevel : 0x%x\n",
le32_to_cpu(req->ImpersonationLevel));
rc = -EIO;
rsp->hdr.Status = STATUS_BAD_IMPERSONATION_LEVEL;
- goto err_out1;
+ goto err_out2;
}
if (req->CreateOptions && !(req->CreateOptions & CREATE_OPTIONS_MASK_LE)) {
pr_err("Invalid create options : 0x%x\n",
le32_to_cpu(req->CreateOptions));
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
} else {
if (req->CreateOptions & FILE_SEQUENTIAL_ONLY_LE &&
req->CreateOptions & FILE_RANDOM_ACCESS_LE)
@@ -2760,13 +2754,13 @@ int smb2_open(struct ksmbd_work *work)
(FILE_OPEN_BY_FILE_ID_LE | CREATE_TREE_CONNECTION |
FILE_RESERVE_OPFILTER_LE)) {
rc = -EOPNOTSUPP;
- goto err_out1;
+ goto err_out2;
}
if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
if (req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE) {
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
} else if (req->CreateOptions & FILE_NO_COMPRESSION_LE) {
req->CreateOptions = ~(FILE_NO_COMPRESSION_LE);
}
@@ -2778,21 +2772,21 @@ int smb2_open(struct ksmbd_work *work)
pr_err("Invalid create disposition : 0x%x\n",
le32_to_cpu(req->CreateDisposition));
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
if (!(req->DesiredAccess & DESIRED_ACCESS_MASK)) {
pr_err("Invalid desired access : 0x%x\n",
le32_to_cpu(req->DesiredAccess));
rc = -EACCES;
- goto err_out1;
+ goto err_out2;
}
if (req->FileAttributes && !(req->FileAttributes & FILE_ATTRIBUTE_MASK_LE)) {
pr_err("Invalid file attribute : 0x%x\n",
le32_to_cpu(req->FileAttributes));
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
if (req->CreateContextsOffset) {
@@ -2800,19 +2794,19 @@ int smb2_open(struct ksmbd_work *work)
context = smb2_find_context_vals(req, SMB2_CREATE_EA_BUFFER, 4);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out1;
+ goto err_out2;
} else if (context) {
ea_buf = (struct create_ea_buf_req *)context;
if (le16_to_cpu(context->DataOffset) +
le32_to_cpu(context->DataLength) <
sizeof(struct create_ea_buf_req)) {
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
if (req->CreateOptions & FILE_NO_EA_KNOWLEDGE_LE) {
rsp->hdr.Status = STATUS_ACCESS_DENIED;
rc = -EACCES;
- goto err_out1;
+ goto err_out2;
}
}
@@ -2820,7 +2814,7 @@ int smb2_open(struct ksmbd_work *work)
SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST, 4);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out1;
+ goto err_out2;
} else if (context) {
ksmbd_debug(SMB,
"get query maximal access context\n");
@@ -2831,11 +2825,11 @@ int smb2_open(struct ksmbd_work *work)
SMB2_CREATE_TIMEWARP_REQUEST, 4);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out1;
+ goto err_out2;
} else if (context) {
ksmbd_debug(SMB, "get timewarp context\n");
rc = -EBADF;
- goto err_out1;
+ goto err_out2;
}
if (tcon->posix_extensions) {
@@ -2843,7 +2837,7 @@ int smb2_open(struct ksmbd_work *work)
SMB2_CREATE_TAG_POSIX, 16);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out1;
+ goto err_out2;
} else if (context) {
struct create_posix *posix =
(struct create_posix *)context;
@@ -2851,7 +2845,7 @@ int smb2_open(struct ksmbd_work *work)
le32_to_cpu(context->DataLength) <
sizeof(struct create_posix) - 4) {
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
ksmbd_debug(SMB, "get posix context\n");
@@ -2863,7 +2857,7 @@ int smb2_open(struct ksmbd_work *work)
if (ksmbd_override_fsids(work)) {
rc = -ENOMEM;
- goto err_out1;
+ goto err_out2;
}
rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS,
@@ -3038,7 +3032,7 @@ int smb2_open(struct ksmbd_work *work)
}
}
- rc = ksmbd_query_inode_status(d_inode(path.dentry->d_parent));
+ rc = ksmbd_query_inode_status(path.dentry->d_parent);
if (rc == KSMBD_INODE_STATUS_PENDING_DELETE) {
rc = -EBUSY;
goto err_out;
@@ -3152,7 +3146,8 @@ int smb2_open(struct ksmbd_work *work)
idmap,
&path,
pntsd,
- pntsd_size);
+ pntsd_size,
+ false);
kfree(pntsd);
if (rc)
pr_err("failed to store ntacl in xattr : %d\n",
@@ -3175,11 +3170,6 @@ int smb2_open(struct ksmbd_work *work)
fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE |
FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE));
- if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC &&
- !fp->attrib_only && !stream_name) {
- smb_break_all_oplock(work, fp);
- need_truncate = 1;
- }
/* fp should be searchable through ksmbd_inode.m_fp_list
* after daccess, saccess, attrib_only, and stream are
@@ -3195,23 +3185,43 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
}
+ if (file_present || created)
+ ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+
+ if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC &&
+ !fp->attrib_only && !stream_name) {
+ smb_break_all_oplock(work, fp);
+ need_truncate = 1;
+ }
+
+ req_op_level = req->RequestedOplockLevel;
+ if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
+ lc = parse_lease_state(req, S_ISDIR(file_inode(filp)->i_mode));
+
share_ret = ksmbd_smb_check_shared_mode(fp->filp, fp);
if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_OPLOCKS) ||
(req_op_level == SMB2_OPLOCK_LEVEL_LEASE &&
!(conn->vals->capabilities & SMB2_GLOBAL_CAP_LEASING))) {
if (share_ret < 0 && !S_ISDIR(file_inode(fp->filp)->i_mode)) {
rc = share_ret;
- goto err_out;
+ goto err_out1;
}
} else {
if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) {
+ /*
+ * Compare the parent lease using the parent key. If no lease
+ * with the same parent key exists, send a lease break
+ * notification.
+ */
+ smb_send_parent_lease_break_noti(fp, lc);
+
req_op_level = smb2_map_lease_to_oplock(lc->req_state);
ksmbd_debug(SMB,
"lease req for(%s) req oplock state 0x%x, lease state 0x%x\n",
name, req_op_level, lc->req_state);
rc = find_same_lease_key(sess, fp->f_ci, lc);
if (rc)
- goto err_out;
+ goto err_out1;
} else if (open_flags == O_RDONLY &&
(req_op_level == SMB2_OPLOCK_LEVEL_BATCH ||
req_op_level == SMB2_OPLOCK_LEVEL_EXCLUSIVE))
@@ -3222,16 +3232,16 @@ int smb2_open(struct ksmbd_work *work)
le32_to_cpu(req->hdr.Id.SyncId.TreeId),
lc, share_ret);
if (rc < 0)
- goto err_out;
+ goto err_out1;
}
if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)
ksmbd_fd_set_delete_on_close(fp, file_info);
if (need_truncate) {
- rc = smb2_create_truncate(&path);
+ rc = smb2_create_truncate(&fp->filp->f_path);
if (rc)
- goto err_out;
+ goto err_out1;
}
if (req->CreateContextsOffset) {
@@ -3241,7 +3251,7 @@ int smb2_open(struct ksmbd_work *work)
SMB2_CREATE_ALLOCATION_SIZE, 4);
if (IS_ERR(az_req)) {
rc = PTR_ERR(az_req);
- goto err_out;
+ goto err_out1;
} else if (az_req) {
loff_t alloc_size;
int err;
@@ -3250,7 +3260,7 @@ int smb2_open(struct ksmbd_work *work)
le32_to_cpu(az_req->ccontext.DataLength) <
sizeof(struct create_alloc_size_req)) {
rc = -EINVAL;
- goto err_out;
+ goto err_out1;
}
alloc_size = le64_to_cpu(az_req->AllocationSize);
ksmbd_debug(SMB,
@@ -3268,7 +3278,7 @@ int smb2_open(struct ksmbd_work *work)
context = smb2_find_context_vals(req, SMB2_CREATE_QUERY_ON_DISK_ID, 4);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out;
+ goto err_out1;
} else if (context) {
ksmbd_debug(SMB, "get query on disk id context\n");
query_disk_id = 1;
@@ -3277,7 +3287,7 @@ int smb2_open(struct ksmbd_work *work)
rc = ksmbd_vfs_getattr(&path, &stat);
if (rc)
- goto err_out;
+ goto err_out1;
if (stat.result_mask & STATX_BTIME)
fp->create_time = ksmbd_UnixTimeToNT(stat.btime);
@@ -3398,13 +3408,13 @@ int smb2_open(struct ksmbd_work *work)
}
err_out:
- if (file_present || created) {
- inode_unlock(d_inode(parent_path.dentry));
- path_put(&path);
- path_put(&parent_path);
- }
- ksmbd_revert_fsids(work);
+ if (rc && (file_present || created))
+ ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+
err_out1:
+ ksmbd_revert_fsids(work);
+
+err_out2:
if (!rc) {
ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED);
rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len);
@@ -4834,9 +4844,9 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode->i_atime);
+ time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
file_info->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_mtime);
+ time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
file_info->LastWriteTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
file_info->ChangeTime = cpu_to_le64(time);
@@ -5443,9 +5453,9 @@ int smb2_close(struct ksmbd_work *work)
rsp->EndOfFile = cpu_to_le64(inode->i_size);
rsp->Attributes = fp->f_ci->m_fattr;
rsp->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode->i_atime);
+ time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
rsp->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_mtime);
+ time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
rsp->LastWriteTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
rsp->ChangeTime = cpu_to_le64(time);
@@ -5537,7 +5547,7 @@ static int smb2_rename(struct ksmbd_work *work,
rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp),
&fp->filp->f_path,
xattr_stream_name,
- NULL, 0, 0);
+ NULL, 0, 0, true);
if (rc < 0) {
pr_err("failed to store stream name in xattr: %d\n",
rc);
@@ -5630,11 +5640,9 @@ static int smb2_create_link(struct ksmbd_work *work,
if (rc)
rc = -EINVAL;
out:
- if (file_present) {
- inode_unlock(d_inode(parent_path.dentry));
- path_put(&path);
- path_put(&parent_path);
- }
+ if (file_present)
+ ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+
if (!IS_ERR(link_name))
kfree(link_name);
kfree(pathname);
@@ -5701,7 +5709,8 @@ static int set_file_basic_info(struct ksmbd_file *fp,
da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME |
XATTR_DOSINFO_ITIME;
- rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da);
+ rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da,
+ true);
if (rc)
ksmbd_debug(SMB,
"failed to restore file attribute in EA\n");
@@ -6013,7 +6022,7 @@ static int smb2_set_info_sec(struct ksmbd_file *fp, int addition_info,
fp->saccess |= FILE_SHARE_DELETE_LE;
return set_info_sec(fp->conn, fp->tcon, &fp->filp->f_path, pntsd,
- buf_len, false);
+ buf_len, false, true);
}
/**
@@ -7078,6 +7087,7 @@ skip:
smb2_remove_blocked_lock,
argv);
if (rc) {
+ kfree(argv);
err = -ENOMEM;
goto out;
}
@@ -7582,7 +7592,8 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id,
da.attr = le32_to_cpu(fp->f_ci->m_fattr);
ret = ksmbd_vfs_set_dos_attrib_xattr(idmap,
- &fp->filp->f_path, &da);
+ &fp->filp->f_path,
+ &da, true);
if (ret)
fp->f_ci->m_fattr = old_fattr;
}
@@ -8208,6 +8219,11 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
le32_to_cpu(req->LeaseState));
}
+ if (ret < 0) {
+ rsp->hdr.Status = err;
+ goto err_out;
+ }
+
lease_state = lease->state;
opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
@@ -8215,11 +8231,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
wake_up_interruptible_all(&opinfo->oplock_brk);
opinfo_put(opinfo);
- if (ret < 0) {
- rsp->hdr.Status = err;
- goto err_out;
- }
-
rsp->StructureSize = cpu_to_le16(36);
rsp->Reserved = 0;
rsp->Flags = 0;
@@ -8231,7 +8242,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
return;
err_out:
- opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
atomic_dec(&opinfo->breaking_cnt);
wake_up_interruptible_all(&opinfo->oplock_brk);
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index e6ba1e9b8589..6691ae68af0c 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -366,11 +366,22 @@ static int smb1_allocate_rsp_buf(struct ksmbd_work *work)
return 0;
}
+/**
+ * set_smb1_rsp_status() - set error type in smb response header
+ * @work: smb work containing smb response header
+ * @err: error code to set in response
+ */
+static void set_smb1_rsp_status(struct ksmbd_work *work, __le32 err)
+{
+ work->send_no_response = 1;
+}
+
static struct smb_version_ops smb1_server_ops = {
.get_cmd_val = get_smb1_cmd_val,
.init_rsp_hdr = init_smb1_rsp_hdr,
.allocate_rsp_buf = smb1_allocate_rsp_buf,
.check_user_session = smb1_check_user_session,
+ .set_rsp_status = set_smb1_rsp_status,
};
static int smb1_negotiate(struct ksmbd_work *work)
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 6c0305be895e..1164365533f0 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -1107,6 +1107,7 @@ pass:
struct smb_acl *pdacl;
struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL;
int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size;
+ int pntsd_alloc_size;
if (parent_pntsd->osidoffset) {
powner_sid = (struct smb_sid *)((char *)parent_pntsd +
@@ -1119,9 +1120,10 @@ pass:
pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4);
}
- pntsd = kzalloc(sizeof(struct smb_ntsd) + powner_sid_size +
- pgroup_sid_size + sizeof(struct smb_acl) +
- nt_size, GFP_KERNEL);
+ pntsd_alloc_size = sizeof(struct smb_ntsd) + powner_sid_size +
+ pgroup_sid_size + sizeof(struct smb_acl) + nt_size;
+
+ pntsd = kzalloc(pntsd_alloc_size, GFP_KERNEL);
if (!pntsd) {
rc = -ENOMEM;
goto free_aces_base;
@@ -1136,6 +1138,27 @@ pass:
pntsd->gsidoffset = parent_pntsd->gsidoffset;
pntsd->dacloffset = parent_pntsd->dacloffset;
+ if ((u64)le32_to_cpu(pntsd->osidoffset) + powner_sid_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
+ if ((u64)le32_to_cpu(pntsd->gsidoffset) + pgroup_sid_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
+ if ((u64)le32_to_cpu(pntsd->dacloffset) + sizeof(struct smb_acl) + nt_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
if (pntsd->osidoffset) {
struct smb_sid *owner_sid = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
@@ -1162,7 +1185,7 @@ pass:
pntsd_size += sizeof(struct smb_acl) + nt_size;
}
- ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size);
+ ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size, false);
kfree(pntsd);
}
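
The three checks added above verify that the offsets inherited from the parent descriptor (osidoffset, gsidoffset, dacloffset) still land inside the freshly sized pntsd allocation before the SIDs and DACL are written through them. Since the same test repeats three times, a small helper could express it once; the sketch below is only an illustration of the bounds test with hypothetical names, not code from this patch.

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

/* Return true if [off, off + size) lies inside an allocation of
 * alloc_size bytes. Widening to 64 bits keeps a large 32-bit offset
 * from wrapping the addition. */
bool region_fits(uint32_t off, size_t size, size_t alloc_size)
{
	return (uint64_t)off + size <= alloc_size;
}

/* Hypothetical usage, mirroring the three checks in the hunk above:
 *
 *	if (!region_fits(osidoffset, owner_sid_size, pntsd_alloc_size) ||
 *	    !region_fits(gsidoffset, group_sid_size, pntsd_alloc_size) ||
 *	    !region_fits(dacloffset, acl_hdr_size + nt_size, pntsd_alloc_size))
 *		return -EINVAL;
 */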
@@ -1354,7 +1377,7 @@ err_out:
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
- bool type_check)
+ bool type_check, bool get_write)
{
int rc;
struct smb_fattr fattr = {{0}};
@@ -1414,7 +1437,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) {
/* Update WinACL in xattr */
ksmbd_vfs_remove_sd_xattrs(idmap, path);
- ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len);
+ ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len,
+ get_write);
}
out:
diff --git a/fs/smb/server/smbacl.h b/fs/smb/server/smbacl.h
index 49a8c292bd2e..2b52861707d8 100644
--- a/fs/smb/server/smbacl.h
+++ b/fs/smb/server/smbacl.h
@@ -207,7 +207,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
__le32 *pdaccess, int uid);
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
- bool type_check);
+ bool type_check, bool get_write);
void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid);
void ksmbd_init_domain(u32 *sub_auth);
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 3b269e1f523a..c5629a68c8b7 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -2140,8 +2140,7 @@ static int smb_direct_ib_client_add(struct ib_device *ib_dev)
if (ib_dev->node_type != RDMA_NODE_IB_CA)
smb_direct_port = SMB_DIRECT_PORT_IWARP;
- if (!ib_dev->ops.get_netdev ||
- !rdma_frwr_is_supported(&ib_dev->attrs))
+ if (!rdma_frwr_is_supported(&ib_dev->attrs))
return 0;
smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL);
@@ -2241,17 +2240,38 @@ bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) {
struct net_device *ndev;
- ndev = smb_dev->ib_dev->ops.get_netdev(smb_dev->ib_dev,
- i + 1);
- if (!ndev)
- continue;
+ if (smb_dev->ib_dev->ops.get_netdev) {
+ ndev = smb_dev->ib_dev->ops.get_netdev(
+ smb_dev->ib_dev, i + 1);
+ if (!ndev)
+ continue;
- if (ndev == netdev) {
+ if (ndev == netdev) {
+ dev_put(ndev);
+ rdma_capable = true;
+ goto out;
+ }
dev_put(ndev);
- rdma_capable = true;
- goto out;
+ /* if ib_dev does not implement ops.get_netdev,
+ * check for a matching InfiniBand GUID in hw_addr
+ */
+ } else if (netdev->type == ARPHRD_INFINIBAND) {
+ struct netdev_hw_addr *ha;
+ union ib_gid gid;
+ u32 port_num;
+ int ret;
+
+ netdev_hw_addr_list_for_each(
+ ha, &netdev->dev_addrs) {
+ memcpy(&gid, ha->addr + 4, sizeof(gid));
+ ret = ib_find_gid(smb_dev->ib_dev, &gid,
+ &port_num, NULL);
+ if (!ret) {
+ rdma_capable = true;
+ goto out;
+ }
+ }
}
- dev_put(ndev);
}
}
out:
diff --git a/fs/smb/server/unicode.c b/fs/smb/server/unicode.c
index 393dd4a7432b..43ed29ee44ea 100644
--- a/fs/smb/server/unicode.c
+++ b/fs/smb/server/unicode.c
@@ -14,45 +14,9 @@
#include "smb_common.h"
/*
- * smb_utf16_bytes() - how long will a string be after conversion?
- * @from: pointer to input string
- * @maxbytes: don't go past this many bytes of input string
- * @codepage: destination codepage
- *
- * Walk a utf16le string and return the number of bytes that the string will
- * be after being converted to the given charset, not including any null
- * termination required. Don't walk past maxbytes in the source buffer.
- *
- * Return: string length after conversion
- */
-static int smb_utf16_bytes(const __le16 *from, int maxbytes,
- const struct nls_table *codepage)
-{
- int i;
- int charlen, outlen = 0;
- int maxwords = maxbytes / 2;
- char tmp[NLS_MAX_CHARSET_SIZE];
- __u16 ftmp;
-
- for (i = 0; i < maxwords; i++) {
- ftmp = get_unaligned_le16(&from[i]);
- if (ftmp == 0)
- break;
-
- charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
- if (charlen > 0)
- outlen += charlen;
- else
- outlen++;
- }
-
- return outlen;
-}
-
-/*
* cifs_mapchar() - convert a host-endian char to proper char in codepage
* @target: where converted character should be copied
- * @src_char: 2 byte host-endian source character
+ * @from: host-endian source string
* @cp: codepage to which character should be converted
* @mapchar: should character be mapped according to mapchars mount option?
*
@@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
* Return: string length after conversion
*/
static int
-cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
+cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
bool mapchar)
{
int len = 1;
+ __u16 src_char;
+
+ src_char = *from;
if (!mapchar)
goto cp_convert;
@@ -104,12 +71,66 @@ out:
cp_convert:
len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
- if (len <= 0) {
- *target = '?';
- len = 1;
- }
+ if (len <= 0)
+ goto surrogate_pair;
goto out;
+
+surrogate_pair:
+ /* convert SURROGATE_PAIR and IVS */
+ if (strcmp(cp->charset, "utf8"))
+ goto unknown;
+ len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
+ if (len <= 0)
+ goto unknown;
+ return len;
+
+unknown:
+ *target = '?';
+ len = 1;
+ goto out;
+}
+
+/*
+ * smb_utf16_bytes() - compute converted string length
+ * @from: pointer to input string
+ * @maxbytes: input string length
+ * @codepage: destination codepage
+ *
+ * Walk a utf16le string and return the number of bytes that the string will
+ * be after being converted to the given charset, not including any null
+ * termination required. Don't walk past maxbytes in the source buffer.
+ *
+ * Return: string length after conversion
+ */
+static int smb_utf16_bytes(const __le16 *from, int maxbytes,
+ const struct nls_table *codepage)
+{
+ int i, j;
+ int charlen, outlen = 0;
+ int maxwords = maxbytes / 2;
+ char tmp[NLS_MAX_CHARSET_SIZE];
+ __u16 ftmp[3];
+
+ for (i = 0; i < maxwords; i++) {
+ ftmp[0] = get_unaligned_le16(&from[i]);
+ if (ftmp[0] == 0)
+ break;
+ for (j = 1; j <= 2; j++) {
+ if (i + j < maxwords)
+ ftmp[j] = get_unaligned_le16(&from[i + j]);
+ else
+ ftmp[j] = 0;
+ }
+
+ charlen = cifs_mapchar(tmp, ftmp, codepage, 0);
+ if (charlen > 0)
+ outlen += charlen;
+ else
+ outlen++;
+ }
+
+ return outlen;
}
/*
@@ -139,12 +160,12 @@ cp_convert:
static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
const struct nls_table *codepage, bool mapchar)
{
- int i, charlen, safelen;
+ int i, j, charlen, safelen;
int outlen = 0;
int nullsize = nls_nullsize(codepage);
int fromwords = fromlen / 2;
char tmp[NLS_MAX_CHARSET_SIZE];
- __u16 ftmp;
+ __u16 ftmp[3]; /* 3 array elements x 2 bytes = up to 6 bytes of UTF-16 */
/*
* because the chars can be of varying widths, we need to take care
@@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
for (i = 0; i < fromwords; i++) {
- ftmp = get_unaligned_le16(&from[i]);
- if (ftmp == 0)
+ ftmp[0] = get_unaligned_le16(&from[i]);
+ if (ftmp[0] == 0)
break;
+ for (j = 1; j <= 2; j++) {
+ if (i + j < fromwords)
+ ftmp[j] = get_unaligned_le16(&from[i + j]);
+ else
+ ftmp[j] = 0;
+ }
/*
* check to see if converting this character might make the
@@ -172,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
/* put converted char into 'to' buffer */
charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
outlen += charlen;
+
+ /*
+ * charlen is the number of UTF-8 bytes for one character:
+ * a surrogate pair (a 4-byte UTF-16 code) converts to 4 bytes
+ * of UTF-8 (charlen=4), and an IVS (7-8 bytes of UTF-8,
+ * charlen=3+4 or 4+4) is split across two UTF-16 pairs.
+ */
+ if (charlen == 4)
+ i++;
+ else if (charlen >= 5)
+ /* 5-6bytes UTF-8 */
+ i += 2;
}
/* properly null-terminate string */
@@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
char src_char;
__le16 dst_char;
wchar_t tmp;
+ wchar_t wchar_to[6]; /* UTF-16 */
+ int ret;
+ unicode_t u;
if (!mapchars)
return smb_strtoUTF16(target, source, srclen, cp);
@@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
* if no match, use question mark, which at least in
* some cases serves as wild card
*/
- if (charlen < 1) {
- dst_char = cpu_to_le16(0x003f);
- charlen = 1;
+ if (charlen > 0)
+ goto ctoUTF16;
+
+ /* convert SURROGATE_PAIR */
+ if (strcmp(cp->charset, "utf8"))
+ goto unknown;
+ if (*(source + i) & 0x80) {
+ charlen = utf8_to_utf32(source + i, 6, &u);
+ if (charlen < 0)
+ goto unknown;
+ } else
+ goto unknown;
+ ret = utf8s_to_utf16s(source + i, charlen,
+ UTF16_LITTLE_ENDIAN,
+ wchar_to, 6);
+ if (ret < 0)
+ goto unknown;
+
+ i += charlen;
+ dst_char = cpu_to_le16(*wchar_to);
+ if (charlen <= 3)
+ /* 1-3bytes UTF-8 to 2bytes UTF-16 */
+ put_unaligned(dst_char, &target[j]);
+ else if (charlen == 4) {
+ /*
+ * 4 bytes of UTF-8 (a surrogate pair) become 4 bytes of UTF-16;
+ * 7-8 bytes of UTF-8 (an IVS, charlen=3+4 or 4+4) are split
+ * across two UTF-16 pairs
+ */
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 1));
+ j++;
+ put_unaligned(dst_char, &target[j]);
+ } else if (charlen >= 5) {
+ /* 5-6bytes UTF-8 to 6bytes UTF-16 */
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 1));
+ j++;
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 2));
+ j++;
+ put_unaligned(dst_char, &target[j]);
}
+ continue;
+
+unknown:
+ dst_char = cpu_to_le16(0x003f);
+ charlen = 1;
}
+
+ctoUTF16:
/*
* character may take more than one byte in the source string,
* but will take exactly two bytes in the target string
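
The unicode.c rework above makes both conversion directions aware of UTF-16 surrogate pairs (and the longer IVS sequences) instead of collapsing every unmappable character to '?'. The short userspace program below encodes one code point above U+FFFF into its surrogate pair by hand, which is the pairing that the kernel helpers utf8s_to_utf16s()/utf16s_to_utf8s() used by the patch handle internally.

#include <stdint.h>
#include <stdio.h>

/* Encode one Unicode code point as UTF-16 code units.
 * Returns the number of 16-bit units written (1 or 2). */
int encode_utf16(uint32_t cp, uint16_t out[2])
{
	if (cp < 0x10000) {
		out[0] = (uint16_t)cp;
		return 1;
	}
	cp -= 0x10000;			/* 20 bits remain */
	out[0] = 0xD800 | (cp >> 10);	/* high surrogate */
	out[1] = 0xDC00 | (cp & 0x3FF);	/* low surrogate */
	return 2;
}

int main(void)
{
	uint16_t units[2];
	int n = encode_utf16(0x1F600, units);	/* emoji, needs a pair */

	printf("%d unit(s): 0x%04X 0x%04X\n", n, units[0],
	       n == 2 ? units[1] : 0);
	return 0;
}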
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index b5a5e50fc9ca..4277750a6da1 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -97,6 +97,13 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
return -ENOENT;
}
+ err = mnt_want_write(parent_path->mnt);
+ if (err) {
+ path_put(parent_path);
+ putname(filename);
+ return -ENOENT;
+ }
+
inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT);
d = lookup_one_qstr_excl(&last, parent_path->dentry, 0);
if (IS_ERR(d))
@@ -123,6 +130,7 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
err_out:
inode_unlock(d_inode(parent_path->dentry));
+ mnt_drop_write(parent_path->mnt);
path_put(parent_path);
putname(filename);
return -ENOENT;
@@ -173,10 +181,6 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
return err;
}
- err = mnt_want_write(path.mnt);
- if (err)
- goto out_err;
-
mode |= S_IFREG;
err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry),
dentry, mode, true);
@@ -186,9 +190,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
} else {
pr_err("File(%s): creation failed (err:%d)\n", name, err);
}
- mnt_drop_write(path.mnt);
-out_err:
done_path_create(&path, dentry);
return err;
}
@@ -219,10 +221,6 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
return err;
}
- err = mnt_want_write(path.mnt);
- if (err)
- goto out_err2;
-
idmap = mnt_idmap(path.mnt);
mode |= S_IFDIR;
err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
@@ -233,21 +231,19 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
dentry->d_name.len);
if (IS_ERR(d)) {
err = PTR_ERR(d);
- goto out_err1;
+ goto out_err;
}
if (unlikely(d_is_negative(d))) {
dput(d);
err = -ENOENT;
- goto out_err1;
+ goto out_err;
}
ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d));
dput(d);
}
-out_err1:
- mnt_drop_write(path.mnt);
-out_err2:
+out_err:
done_path_create(&path, dentry);
if (err)
pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
@@ -463,7 +459,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
fp->stream.name,
(void *)stream_buf,
size,
- 0);
+ 0,
+ true);
if (err < 0)
goto out;
@@ -520,6 +517,9 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
}
}
+ /* Reserve lease break for parent dir at closing time */
+ fp->reserve_lease_break = true;
+
/* Do we need to break any of a levelII oplock? */
smb_break_all_levII_oplock(work, fp, 1);
@@ -605,10 +605,6 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
goto out_err;
}
- err = mnt_want_write(path->mnt);
- if (err)
- goto out_err;
-
idmap = mnt_idmap(path->mnt);
if (S_ISDIR(d_inode(path->dentry)->i_mode)) {
err = vfs_rmdir(idmap, d_inode(parent), path->dentry);
@@ -619,7 +615,6 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
if (err)
ksmbd_debug(VFS, "unlink failed, err %d\n", err);
}
- mnt_drop_write(path->mnt);
out_err:
ksmbd_revert_fsids(work);
@@ -665,16 +660,11 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname,
goto out3;
}
- err = mnt_want_write(newpath.mnt);
- if (err)
- goto out3;
-
err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt),
d_inode(newpath.dentry),
dentry, NULL);
if (err)
ksmbd_debug(VFS, "vfs_link failed err %d\n", err);
- mnt_drop_write(newpath.mnt);
out3:
done_path_create(&newpath, dentry);
@@ -732,7 +722,7 @@ retry:
goto out3;
}
- parent_fp = ksmbd_lookup_fd_inode(d_inode(old_child->d_parent));
+ parent_fp = ksmbd_lookup_fd_inode(old_child->d_parent);
if (parent_fp) {
if (parent_fp->daccess & FILE_DELETE_LE) {
pr_err("parent dir is opened with delete access\n");
@@ -919,23 +909,27 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap,
/**
* ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value
* @idmap: idmap of the relevant mount
- * @dentry: dentry to set XATTR at
+ * @path: path of dentry to set XATTR at
* @attr_name: xattr name for setxattr
* @attr_value: xattr value to set
* @attr_size: size of xattr value
* @flags: flags passed on to vfs_setxattr()
+ * @get_write: whether to take write access to the mount
*
* Return: 0 on success, otherwise error
*/
int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
const struct path *path, const char *attr_name,
- void *attr_value, size_t attr_size, int flags)
+ void *attr_value, size_t attr_size, int flags,
+ bool get_write)
{
int err;
- err = mnt_want_write(path->mnt);
- if (err)
- return err;
+ if (get_write == true) {
+ err = mnt_want_write(path->mnt);
+ if (err)
+ return err;
+ }
err = vfs_setxattr(idmap,
path->dentry,
@@ -945,7 +939,8 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
flags);
if (err)
ksmbd_debug(VFS, "setxattr failed, err %d\n", err);
- mnt_drop_write(path->mnt);
+ if (get_write == true)
+ mnt_drop_write(path->mnt);
return err;
}
@@ -1194,9 +1189,10 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
/**
* ksmbd_vfs_kern_path_locked() - lookup a file and get path info
- * @name: file path that is relative to share
- * @flags: lookup flags
- * @path: if lookup succeed, return path info
+ * @name: file path that is relative to share
+ * @flags: lookup flags
+ * @parent_path: if lookup succeed, return parent_path info
+ * @path: if lookup succeed, return path info
* @caseless: caseless filename lookup
*
* Return: 0 on success, otherwise error
@@ -1268,6 +1264,13 @@ out1:
}
if (!err) {
+ err = mnt_want_write(parent_path->mnt);
+ if (err) {
+ path_put(path);
+ path_put(parent_path);
+ return err;
+ }
+
err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry);
if (err) {
path_put(path);
@@ -1277,6 +1280,14 @@ out1:
return err;
}
+void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path)
+{
+ inode_unlock(d_inode(parent_path->dentry));
+ mnt_drop_write(parent_path->mnt);
+ path_put(path);
+ path_put(parent_path);
+}
+
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
@@ -1431,7 +1442,8 @@ out:
int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
const struct path *path,
- struct smb_ntsd *pntsd, int len)
+ struct smb_ntsd *pntsd, int len,
+ bool get_write)
{
int rc;
struct ndr sd_ndr = {0}, acl_ndr = {0};
@@ -1491,7 +1503,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
rc = ksmbd_vfs_setxattr(idmap, path,
XATTR_NAME_SD, sd_ndr.data,
- sd_ndr.offset, 0);
+ sd_ndr.offset, 0, get_write);
if (rc < 0)
pr_err("Failed to store XATTR ntacl :%d\n", rc);
@@ -1580,7 +1592,8 @@ free_n_data:
int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
const struct path *path,
- struct xattr_dos_attrib *da)
+ struct xattr_dos_attrib *da,
+ bool get_write)
{
struct ndr n;
int err;
@@ -1590,7 +1603,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
return err;
err = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_DOS_ATTRIBUTE,
- (void *)n.data, n.offset, 0);
+ (void *)n.data, n.offset, 0, get_write);
if (err)
ksmbd_debug(SMB, "failed to store dos attribute in xattr\n");
kfree(n.data);
@@ -1862,10 +1875,6 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
}
posix_state_to_acl(&acl_state, acls->a_entries);
- rc = mnt_want_write(path->mnt);
- if (rc)
- goto out_err;
-
rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
@@ -1877,9 +1886,7 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
}
- mnt_drop_write(path->mnt);
-out_err:
free_acl_state(&acl_state);
posix_acl_release(acls);
return rc;
@@ -1909,10 +1916,6 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
}
}
- rc = mnt_want_write(path->mnt);
- if (rc)
- goto out_err;
-
rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
@@ -1924,9 +1927,7 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
}
- mnt_drop_write(path->mnt);
-out_err:
posix_acl_release(acls);
return rc;
}
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index 00968081856e..cfe1c8092f23 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -109,7 +109,8 @@ ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap,
int attr_name_len);
int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
const struct path *path, const char *attr_name,
- void *attr_value, size_t attr_size, int flags);
+ void *attr_value, size_t attr_size, int flags,
+ bool get_write);
int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
@@ -117,6 +118,7 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
unsigned int flags, struct path *parent_path,
struct path *path, bool caseless);
+void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path);
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
@@ -144,14 +146,16 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path)
int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
const struct path *path,
- struct smb_ntsd *pntsd, int len);
+ struct smb_ntsd *pntsd, int len,
+ bool get_write);
int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
struct dentry *dentry,
struct smb_ntsd **pntsd);
int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
const struct path *path,
- struct xattr_dos_attrib *da);
+ struct xattr_dos_attrib *da,
+ bool get_write);
int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap,
struct dentry *dentry,
struct xattr_dos_attrib *da);
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index c91eac6514dd..4e82ff627d12 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -66,14 +66,14 @@ static unsigned long inode_hash(struct super_block *sb, unsigned long hashval)
return tmp & inode_hash_mask;
}
-static struct ksmbd_inode *__ksmbd_inode_lookup(struct inode *inode)
+static struct ksmbd_inode *__ksmbd_inode_lookup(struct dentry *de)
{
struct hlist_head *head = inode_hashtable +
- inode_hash(inode->i_sb, inode->i_ino);
+ inode_hash(d_inode(de)->i_sb, (unsigned long)de);
struct ksmbd_inode *ci = NULL, *ret_ci = NULL;
hlist_for_each_entry(ci, head, m_hash) {
- if (ci->m_inode == inode) {
+ if (ci->m_de == de) {
if (atomic_inc_not_zero(&ci->m_count))
ret_ci = ci;
break;
@@ -84,26 +84,27 @@ static struct ksmbd_inode *__ksmbd_inode_lookup(struct inode *inode)
static struct ksmbd_inode *ksmbd_inode_lookup(struct ksmbd_file *fp)
{
- return __ksmbd_inode_lookup(file_inode(fp->filp));
+ return __ksmbd_inode_lookup(fp->filp->f_path.dentry);
}
-static struct ksmbd_inode *ksmbd_inode_lookup_by_vfsinode(struct inode *inode)
+struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d)
{
struct ksmbd_inode *ci;
read_lock(&inode_hash_lock);
- ci = __ksmbd_inode_lookup(inode);
+ ci = __ksmbd_inode_lookup(d);
read_unlock(&inode_hash_lock);
+
return ci;
}
-int ksmbd_query_inode_status(struct inode *inode)
+int ksmbd_query_inode_status(struct dentry *dentry)
{
struct ksmbd_inode *ci;
int ret = KSMBD_INODE_STATUS_UNKNOWN;
read_lock(&inode_hash_lock);
- ci = __ksmbd_inode_lookup(inode);
+ ci = __ksmbd_inode_lookup(dentry);
if (ci) {
ret = KSMBD_INODE_STATUS_OK;
if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS))
@@ -143,7 +144,7 @@ void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp,
static void ksmbd_inode_hash(struct ksmbd_inode *ci)
{
struct hlist_head *b = inode_hashtable +
- inode_hash(ci->m_inode->i_sb, ci->m_inode->i_ino);
+ inode_hash(d_inode(ci->m_de)->i_sb, (unsigned long)ci->m_de);
hlist_add_head(&ci->m_hash, b);
}
@@ -157,7 +158,6 @@ static void ksmbd_inode_unhash(struct ksmbd_inode *ci)
static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp)
{
- ci->m_inode = file_inode(fp->filp);
atomic_set(&ci->m_count, 1);
atomic_set(&ci->op_count, 0);
atomic_set(&ci->sop_count, 0);
@@ -166,6 +166,7 @@ static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp)
INIT_LIST_HEAD(&ci->m_fp_list);
INIT_LIST_HEAD(&ci->m_op_list);
rwlock_init(&ci->m_lock);
+ ci->m_de = fp->filp->f_path.dentry;
return 0;
}
@@ -209,7 +210,7 @@ static void ksmbd_inode_free(struct ksmbd_inode *ci)
kfree(ci);
}
-static void ksmbd_inode_put(struct ksmbd_inode *ci)
+void ksmbd_inode_put(struct ksmbd_inode *ci)
{
if (atomic_dec_and_test(&ci->m_count))
ksmbd_inode_free(ci);
@@ -488,12 +489,15 @@ struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid)
return fp;
}
-struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode)
+struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry)
{
struct ksmbd_file *lfp;
struct ksmbd_inode *ci;
+ struct inode *inode = d_inode(dentry);
- ci = ksmbd_inode_lookup_by_vfsinode(inode);
+ read_lock(&inode_hash_lock);
+ ci = __ksmbd_inode_lookup(dentry);
+ read_unlock(&inode_hash_lock);
if (!ci)
return NULL;
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 03d0bf941216..a528f0cc775a 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -51,7 +51,7 @@ struct ksmbd_inode {
atomic_t op_count;
/* opinfo count for streams */
atomic_t sop_count;
- struct inode *m_inode;
+ struct dentry *m_de;
unsigned int m_flags;
struct hlist_node m_hash;
struct list_head m_fp_list;
@@ -105,6 +105,7 @@ struct ksmbd_file {
struct ksmbd_readdir_data readdir_data;
int dot_dotdot[2];
unsigned int f_state;
+ bool reserve_lease_break;
};
static inline void set_ctx_actor(struct dir_context *ctx,
@@ -138,9 +139,11 @@ struct ksmbd_file *ksmbd_lookup_foreign_fd(struct ksmbd_work *work, u64 id);
struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
u64 pid);
void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp);
+struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d);
+void ksmbd_inode_put(struct ksmbd_inode *ci);
struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id);
struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid);
-struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode);
+struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry);
unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp);
struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp);
void ksmbd_close_tree_conn_fds(struct ksmbd_work *work);
@@ -164,7 +167,7 @@ enum KSMBD_INODE_STATUS {
KSMBD_INODE_STATUS_PENDING_DELETE,
};
-int ksmbd_query_inode_status(struct inode *inode);
+int ksmbd_query_inode_status(struct dentry *dentry);
bool ksmbd_inode_pending_delete(struct ksmbd_file *fp);
void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp);
void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp);
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 581ce9519339..2dc730800f44 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -321,7 +321,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2,
compressed ? "" : "un", length);
}
- if (length < 0 || length > output->length ||
+ if (length <= 0 || length > output->length ||
(index + length) > msblk->bytes_used) {
res = -EIO;
goto out;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 723763746238..62972f0ff868 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -173,6 +173,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
const struct export_operations squashfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = squashfs_fh_to_dentry,
.fh_to_parent = squashfs_fh_to_parent,
.get_parent = squashfs_get_parent
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index c6e626b00546..aa3411354e66 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -59,9 +59,9 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
i_uid_write(inode, i_uid);
i_gid_write(inode, i_gid);
inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
- inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
- inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
- inode_set_ctime(inode, inode->i_mtime.tv_sec, 0);
+ inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
+ inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
+ inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
inode->i_mode = le16_to_cpu(sqsh_ino->mode);
inode->i_size = 0;
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index a6164fdf9435..5a756e6790b5 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -111,4 +111,4 @@ extern const struct address_space_operations squashfs_symlink_aops;
extern const struct inode_operations squashfs_symlink_inode_ops;
/* xattr.c */
-extern const struct xattr_handler *squashfs_xattr_handlers[];
+extern const struct xattr_handler * const squashfs_xattr_handlers[];
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e1e3f3dd5a06..ce6608cabd49 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -262,7 +262,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int type)
}
}
-const struct xattr_handler *squashfs_xattr_handlers[] = {
+const struct xattr_handler * const squashfs_xattr_handlers[] = {
&squashfs_xattr_user_handler,
&squashfs_xattr_trusted_handler,
&squashfs_xattr_security_handler,
diff --git a/fs/stack.c b/fs/stack.c
index b5e01bdb5f5f..f18920119944 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -66,8 +66,8 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
dest->i_uid = src->i_uid;
dest->i_gid = src->i_gid;
dest->i_rdev = src->i_rdev;
- dest->i_atime = src->i_atime;
- dest->i_mtime = src->i_mtime;
+ inode_set_atime_to_ts(dest, inode_get_atime(src));
+ inode_set_mtime_to_ts(dest, inode_get_mtime(src));
inode_set_ctime_to_ts(dest, inode_get_ctime(src));
dest->i_blkbits = src->i_blkbits;
dest->i_flags = src->i_flags;
diff --git a/fs/stat.c b/fs/stat.c
index d43a5cc1bfa4..f721d26ec3f7 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -57,8 +57,8 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
- stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
+ stat->atime = inode_get_atime(inode);
+ stat->mtime = inode_get_mtime(inode);
stat->ctime = inode_get_ctime(inode);
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
@@ -133,7 +133,8 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
idmap = mnt_idmap(path->mnt);
if (inode->i_op->getattr)
return inode->i_op->getattr(idmap, path, stat,
- request_mask, query_flags);
+ request_mask,
+ query_flags | AT_GETATTR_NOSEC);
generic_fillattr(idmap, request_mask, inode, stat);
return 0;
@@ -166,6 +167,9 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
{
int retval;
+ if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC))
+ return -EPERM;
+
retval = security_inode_getattr(path);
if (retval)
return retval;
diff --git a/fs/super.c b/fs/super.c
index 2d762ce67f6e..076392396e72 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -178,7 +178,7 @@ static void super_wake(struct super_block *sb, unsigned int flag)
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
@@ -191,7 +191,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
long dentries;
long inodes;
- sb = container_of(shrink, struct super_block, s_shrink);
+ sb = shrink->private_data;
/*
* Deadlock avoidance. We may hold various FS locks, and we don't want
@@ -244,7 +244,7 @@ static unsigned long super_cache_count(struct shrinker *shrink,
struct super_block *sb;
long total_objects = 0;
- sb = container_of(shrink, struct super_block, s_shrink);
+ sb = shrink->private_data;
/*
* We don't call super_trylock_shared() here as it is a scalability
@@ -306,7 +306,7 @@ static void destroy_unused_super(struct super_block *s)
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
- free_prealloced_shrinker(&s->s_shrink);
+ shrinker_free(s->s_shrink);
/* no delays needed */
destroy_super_work(&s->destroy_work);
}
@@ -383,16 +383,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_time_min = TIME64_MIN;
s->s_time_max = TIME64_MAX;
- s->s_shrink.seeks = DEFAULT_SEEKS;
- s->s_shrink.scan_objects = super_cache_scan;
- s->s_shrink.count_objects = super_cache_count;
- s->s_shrink.batch = 1024;
- s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
- if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name))
+ s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
+ "sb-%s", type->name);
+ if (!s->s_shrink)
goto fail;
- if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+
+ s->s_shrink->scan_objects = super_cache_scan;
+ s->s_shrink->count_objects = super_cache_count;
+ s->s_shrink->batch = 1024;
+ s->s_shrink->private_data = s;
+
+ if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink))
goto fail;
- if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+ if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
goto fail;
return s;
@@ -477,7 +480,7 @@ void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
- unregister_shrinker(&s->s_shrink);
+ shrinker_free(s->s_shrink);
fs->kill_sb(s);
kill_super_notify(s);
@@ -818,7 +821,7 @@ retry:
hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(s->s_type);
- register_shrinker_prepared(&s->s_shrink);
+ shrinker_register(s->s_shrink);
return s;
share_extant_sb:
@@ -901,7 +904,7 @@ retry:
hlist_add_head(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(type);
- register_shrinker_prepared(&s->s_shrink);
+ shrinker_register(s->s_shrink);
return s;
}
EXPORT_SYMBOL(sget);
@@ -1419,32 +1422,48 @@ EXPORT_SYMBOL(sget_dev);
#ifdef CONFIG_BLOCK
/*
- * Lock a super block that the callers holds a reference to.
+ * Lock the superblock that is the holder of the bdev. Returns the superblock
+ * pointer if we successfully locked the superblock and it is alive. Otherwise
+ * we return NULL and just unlock bdev->bd_holder_lock.
*
- * The caller needs to ensure that the super_block isn't being freed while
- * calling this function, e.g. by holding a lock over the call to this function
- * and the place that clears the pointer to the superblock used by this function
- * before freeing the superblock.
+ * The function must be called with bdev->bd_holder_lock held and releases it.
*/
-static bool super_lock_shared_active(struct super_block *sb)
+static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock)
{
- bool born = super_lock_shared(sb);
+ struct super_block *sb = bdev->bd_holder;
+ bool born;
+
+ lockdep_assert_held(&bdev->bd_holder_lock);
+ lockdep_assert_not_held(&sb->s_umount);
+ lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
+ /* Make sure sb doesn't go away from under us */
+ spin_lock(&sb_lock);
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ mutex_unlock(&bdev->bd_holder_lock);
+
+ born = super_lock_shared(sb);
if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
super_unlock_shared(sb);
- return false;
+ put_super(sb);
+ return NULL;
}
- return true;
+ /*
+	 * The superblock is active and we hold s_umount, so we can drop our
+ * temporary reference now.
+ */
+ put_super(sb);
+ return sb;
}
static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
- struct super_block *sb = bdev->bd_holder;
-
- /* bd_holder_lock ensures that the sb isn't freed */
- lockdep_assert_held(&bdev->bd_holder_lock);
+ struct super_block *sb;
- if (!super_lock_shared_active(sb))
+ sb = bdev_super_lock_shared(bdev);
+ if (!sb)
return;
if (!surprise)
@@ -1459,11 +1478,10 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
static void fs_bdev_sync(struct block_device *bdev)
{
- struct super_block *sb = bdev->bd_holder;
-
- lockdep_assert_held(&bdev->bd_holder_lock);
+ struct super_block *sb;
- if (!super_lock_shared_active(sb))
+ sb = bdev_super_lock_shared(bdev);
+ if (!sb)
return;
sync_filesystem(sb);
super_unlock_shared(sb);
@@ -1479,14 +1497,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
struct fs_context *fc)
{
blk_mode_t mode = sb_open_mode(sb_flags);
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
- bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+ if (IS_ERR(bdev_handle)) {
if (fc)
errorf(fc, "%s: Can't open blockdev", fc->source);
- return PTR_ERR(bdev);
+ return PTR_ERR(bdev_handle);
}
+ bdev = bdev_handle->bdev;
/*
* This really should be in blkdev_get_by_dev, but right now can't due
@@ -1494,7 +1514,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
* writable from userspace even for a read-only block device.
*/
if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return -EACCES;
}
@@ -1510,10 +1530,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (fc)
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return -EBUSY;
}
spin_lock(&sb_lock);
+ sb->s_bdev_handle = bdev_handle;
sb->s_bdev = bdev;
sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
if (bdev_stable_writes(bdev))
@@ -1522,7 +1543,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
mutex_unlock(&bdev->bd_fsfreeze_mutex);
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name,
+ shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
sb->s_id);
sb_set_blocksize(sb, block_size(bdev));
return 0;
@@ -1646,7 +1667,7 @@ void kill_block_super(struct super_block *sb)
generic_shutdown_super(sb);
if (bdev) {
sync_blockdev(bdev);
- blkdev_put(bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
}
@@ -2139,3 +2160,4 @@ int sb_init_dio_done_wq(struct super_block *sb)
destroy_workqueue(wq);
return 0;
}
+EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
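The fs/super.c hunks above move the superblock shrinker from an embedded struct shrinker to the dynamically allocated API (shrinker_alloc(), shrinker_register(), shrinker_free(), with callbacks reaching their owner through private_data). A minimal registration sketch of that pattern, for illustration only and with hypothetical names (my_cache, my_cache_init), assuming the interface shown in the diff:

    #include <linux/errno.h>
    #include <linux/shrinker.h>

    struct my_cache {                       /* made-up cache, for illustration */
            struct shrinker *shrinker;
            unsigned long nr_objects;
    };

    static unsigned long my_cache_count(struct shrinker *shrink,
                                        struct shrink_control *sc)
    {
            struct my_cache *c = shrink->private_data;

            return c->nr_objects ? c->nr_objects : SHRINK_EMPTY;
    }

    static unsigned long my_cache_scan(struct shrinker *shrink,
                                       struct shrink_control *sc)
    {
            /* free up to sc->nr_to_scan objects and return how many were freed */
            return SHRINK_STOP;
    }

    static int my_cache_init(struct my_cache *c)
    {
            c->shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "my-cache");
            if (!c->shrinker)
                    return -ENOMEM;

            c->shrinker->count_objects = my_cache_count;
            c->shrinker->scan_objects = my_cache_scan;
            c->shrinker->batch = 1024;
            c->shrinker->private_data = c;

            shrinker_register(c->shrinker);
            return 0;
    }

    /* Teardown mirrors deactivate_locked_super(): shrinker_free(c->shrinker); */

The diff applies exactly this pattern to the per-superblock shrinker, storing the super_block in private_data instead of using container_of().
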
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index a12ac0356c69..6b7652fb8050 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -167,6 +167,18 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
return battr->mmap(of->file, kobj, battr, vma);
}
+static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
+ int whence)
+{
+ struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = of->kn->parent->priv;
+
+ if (battr->llseek)
+ return battr->llseek(of->file, kobj, battr, offset, whence);
+ else
+ return generic_file_llseek(of->file, offset, whence);
+}
+
static int sysfs_kf_bin_open(struct kernfs_open_file *of)
{
struct bin_attribute *battr = of->kn->priv;
@@ -249,6 +261,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
.write = sysfs_kf_bin_write,
.mmap = sysfs_kf_bin_mmap,
.open = sysfs_kf_bin_open,
+ .llseek = sysfs_kf_bin_llseek,
};
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 2f5ead88d00b..2e126d72d619 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -224,7 +224,7 @@ got_it:
memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = sysv_handle_dirsync(dir);
out_page:
@@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
}
de->inode = 0;
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return sysv_handle_dirsync(inode);
}
@@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
}
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
return sysv_handle_dirsync(inode);
}
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 6719da5889d9..269df6d49815 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
dirty_sb(sb);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = fs16_to_cpu(sbi, ino);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
SYSV_I(inode)->i_dir_start_lookup = 0;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0aa3827d8178..5a915b2e68f5 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -200,11 +200,9 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
- inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
- inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
+ inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0);
+ inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0);
inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
inode->i_blocks = 0;
si = SYSV_I(inode);
@@ -253,9 +251,9 @@ static int __sysv_write_inode(struct inode *inode, int wait)
raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
- raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec);
- raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec);
- raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec);
+ raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode));
+ raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode));
+ raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode));
si = SYSV_I(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index edb94e55de8e..725981474e5f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -423,7 +423,7 @@ do_indirects:
}
n++;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
sysv_sync_inode (inode);
else
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 8c8d64e76103..f0677ea0ec24 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -2,8 +2,9 @@
/*
* event_inode.c - part of tracefs, a pseudo file system for activating tracing
*
- * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
+ * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt <rostedt@goodmis.org>
* Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
+ * Copyright (C) 2023 Google, author: Steven Rostedt <rostedt@goodmis.org>
*
* eventfs is used to dynamically create inodes and dentries based on the
* meta data provided by the tracing system.
@@ -23,48 +24,30 @@
#include <linux/delay.h>
#include "internal.h"
-struct eventfs_inode {
- struct list_head e_top_files;
-};
+/*
+ * eventfs_mutex protects the eventfs_inode (ei) dentry. Any access
+ * to the ei->dentry must be done under this mutex and after checking
+ * that ei->is_freed is not set. When ei->is_freed is set, the dentry
+ * is on its way to being freed after the last dput() is made on it.
+ */
+static DEFINE_MUTEX(eventfs_mutex);
/*
- * struct eventfs_file - hold the properties of the eventfs files and
- * directories.
- * @name: the name of the file or directory to create
- * @d_parent: holds parent's dentry
- * @dentry: once accessed holds dentry
- * @list: file or directory to be added to parent directory
- * @ei: list of files and directories within directory
- * @fop: file_operations for file or directory
- * @iop: inode_operations for file or directory
- * @data: something that the caller will want to get to later on
- * @mode: the permission that the file or directory should have
+ * The eventfs_inode (ei) itself is protected by SRCU. It is released from
+ * its parent's list and will have is_freed set (under eventfs_mutex).
+ * After the SRCU grace period is over and the last dput() is called,
+ * the ei is freed.
*/
-struct eventfs_file {
- const char *name;
- struct dentry *d_parent;
- struct dentry *dentry;
- struct list_head list;
- struct eventfs_inode *ei;
- const struct file_operations *fop;
- const struct inode_operations *iop;
- /*
- * Union - used for deletion
- * @del_list: list of eventfs_file to delete
- * @rcu: eventfs_file to delete in RCU
- * @is_freed: node is freed if one of the above is set
- */
- union {
- struct list_head del_list;
- struct rcu_head rcu;
- unsigned long is_freed;
- };
- void *data;
- umode_t mode;
+DEFINE_STATIC_SRCU(eventfs_srcu);
+
+/* Mode is unsigned short, use the upper bits for flags */
+enum {
+ EVENTFS_SAVE_MODE = BIT(16),
+ EVENTFS_SAVE_UID = BIT(17),
+ EVENTFS_SAVE_GID = BIT(18),
};
-static DEFINE_MUTEX(eventfs_mutex);
-DEFINE_STATIC_SRCU(eventfs_srcu);
+#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1)
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
@@ -73,8 +56,95 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
static int eventfs_release(struct inode *inode, struct file *file);
+static void update_attr(struct eventfs_attr *attr, struct iattr *iattr)
+{
+ unsigned int ia_valid = iattr->ia_valid;
+
+ if (ia_valid & ATTR_MODE) {
+ attr->mode = (attr->mode & ~EVENTFS_MODE_MASK) |
+ (iattr->ia_mode & EVENTFS_MODE_MASK) |
+ EVENTFS_SAVE_MODE;
+ }
+ if (ia_valid & ATTR_UID) {
+ attr->mode |= EVENTFS_SAVE_UID;
+ attr->uid = iattr->ia_uid;
+ }
+ if (ia_valid & ATTR_GID) {
+ attr->mode |= EVENTFS_SAVE_GID;
+ attr->gid = iattr->ia_gid;
+ }
+}
+
+static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *iattr)
+{
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei;
+ const char *name;
+ int ret;
+
+ mutex_lock(&eventfs_mutex);
+ ei = dentry->d_fsdata;
+ if (ei->is_freed) {
+ /* Do not allow changes if the event is about to be removed. */
+ mutex_unlock(&eventfs_mutex);
+ return -ENODEV;
+ }
+
+ /* Preallocate the children mode array if necessary */
+ if (!(dentry->d_inode->i_mode & S_IFDIR)) {
+ if (!ei->entry_attrs) {
+ ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries,
+ GFP_NOFS);
+ if (!ei->entry_attrs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ ret = simple_setattr(idmap, dentry, iattr);
+ if (ret < 0)
+ goto out;
+
+ /*
+	 * If this is a dir, then update the ei cache; only the file
+ * mode is saved in the ei->m_children, and the ownership is
+ * determined by the parent directory.
+ */
+ if (dentry->d_inode->i_mode & S_IFDIR) {
+ /*
+		 * The events directory dentry is never freed, unless it's
+		 * part of an instance that is deleted. Its attr is the
+		 * default for its child files and directories.
+		 * Do not update it. It's not used for its own mode or ownership.
+ */
+ if (!ei->is_events)
+ update_attr(&ei->attr, iattr);
+
+ } else {
+ name = dentry->d_name.name;
+
+ for (int i = 0; i < ei->nr_entries; i++) {
+ entry = &ei->entries[i];
+ if (strcmp(name, entry->name) == 0) {
+ update_attr(&ei->entry_attrs[i], iattr);
+ break;
+ }
+ }
+ }
+ out:
+ mutex_unlock(&eventfs_mutex);
+ return ret;
+}
+
static const struct inode_operations eventfs_root_dir_inode_operations = {
.lookup = eventfs_root_lookup,
+ .setattr = eventfs_set_attr,
+};
+
+static const struct inode_operations eventfs_file_inode_operations = {
+ .setattr = eventfs_set_attr,
};
static const struct file_operations eventfs_file_operations = {
@@ -85,26 +155,110 @@ static const struct file_operations eventfs_file_operations = {
.release = eventfs_release,
};
+/* Return the eventfs_inode of the "events" directory */
+static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
+{
+ struct eventfs_inode *ei;
+
+ mutex_lock(&eventfs_mutex);
+ do {
+ /* The parent always has an ei, except for events itself */
+ ei = dentry->d_parent->d_fsdata;
+
+ /*
+ * If the ei is being freed, the ownership of the children
+ * doesn't matter.
+ */
+ if (ei->is_freed) {
+ ei = NULL;
+ break;
+ }
+
+ dentry = ei->dentry;
+ } while (!ei->is_events);
+ mutex_unlock(&eventfs_mutex);
+
+ return ei;
+}
+
+static void update_inode_attr(struct dentry *dentry, struct inode *inode,
+ struct eventfs_attr *attr, umode_t mode)
+{
+ struct eventfs_inode *events_ei = eventfs_find_events(dentry);
+
+ if (!events_ei)
+ return;
+
+ inode->i_mode = mode;
+ inode->i_uid = events_ei->attr.uid;
+ inode->i_gid = events_ei->attr.gid;
+
+ if (!attr)
+ return;
+
+ if (attr->mode & EVENTFS_SAVE_MODE)
+ inode->i_mode = attr->mode & EVENTFS_MODE_MASK;
+
+ if (attr->mode & EVENTFS_SAVE_UID)
+ inode->i_uid = attr->uid;
+
+ if (attr->mode & EVENTFS_SAVE_GID)
+ inode->i_gid = attr->gid;
+}
+
+static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level)
+{
+ struct eventfs_inode *ei_child;
+
+ /* at most we have events/system/event */
+ if (WARN_ON_ONCE(level > 3))
+ return;
+
+ ei->attr.gid = gid;
+
+ if (ei->entry_attrs) {
+ for (int i = 0; i < ei->nr_entries; i++) {
+ ei->entry_attrs[i].gid = gid;
+ }
+ }
+
+ /*
+	 * Only eventfs_inodes with dentries are updated; to make sure
+	 * all eventfs_inodes are updated, if one of the children does
+	 * not have a dentry, this function must traverse into it.
+ */
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+ if (!ei_child->dentry)
+ update_gid(ei_child, gid, level + 1);
+ }
+}
+
+void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
+{
+ struct eventfs_inode *ei = dentry->d_fsdata;
+ int idx;
+
+ idx = srcu_read_lock(&eventfs_srcu);
+ update_gid(ei, gid, 0);
+ srcu_read_unlock(&eventfs_srcu, idx);
+}
+
/**
* create_file - create a file in the tracefs filesystem
* @name: the name of the file to create.
* @mode: the permission that the file should have.
+ * @attr: saved attributes changed by user
* @parent: parent dentry for this file.
* @data: something that the caller will want to get to later on.
* @fop: struct file_operations that should be used for this file.
*
- * This is the basic "create a file" function for tracefs. It allows for a
- * wide range of flexibility in creating a file.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
- *
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function creates a dentry that represents a file in the eventfs_inode
+ * directory. The inode.i_private pointer will point to @data in the open()
+ * call.
*/
static struct dentry *create_file(const char *name, umode_t mode,
+ struct eventfs_attr *attr,
struct dentry *parent, void *data,
const struct file_operations *fop)
{
@@ -118,6 +272,7 @@ static struct dentry *create_file(const char *name, umode_t mode,
if (WARN_ON_ONCE(!S_ISREG(mode)))
return NULL;
+ WARN_ON_ONCE(!parent);
dentry = eventfs_start_creating(name, parent);
if (IS_ERR(dentry))
@@ -127,7 +282,10 @@ static struct dentry *create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return eventfs_failed_creating(dentry);
- inode->i_mode = mode;
+ /* If the user updated the directory's attributes, use them */
+ update_inode_attr(dentry, inode, attr, mode);
+
+ inode->i_op = &eventfs_file_inode_operations;
inode->i_fop = fop;
inode->i_private = data;
@@ -140,28 +298,19 @@ static struct dentry *create_file(const char *name, umode_t mode,
/**
* create_dir - create a dir in the tracefs filesystem
- * @name: the name of the file to create.
+ * @ei: the eventfs_inode that represents the directory to create
* @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- *
- * This is the basic "create a dir" function for eventfs. It allows for a
- * wide range of flexibility in creating a dir.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
*
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function will create a dentry for a directory represented by
+ * an eventfs_inode.
*/
-static struct dentry *create_dir(const char *name, struct dentry *parent, void *data)
+static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
{
struct tracefs_inode *ti;
struct dentry *dentry;
struct inode *inode;
- dentry = eventfs_start_creating(name, parent);
+ dentry = eventfs_start_creating(ei->name, parent);
if (IS_ERR(dentry))
return dentry;
@@ -169,10 +318,12 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
if (unlikely(!inode))
return eventfs_failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ /* If the user updated the directory's attributes, use them */
+ update_inode_attr(dentry, inode, &ei->attr,
+ S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
- inode->i_private = data;
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
@@ -184,117 +335,192 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
return eventfs_end_creating(dentry);
}
+static void free_ei(struct eventfs_inode *ei)
+{
+ kfree_const(ei->name);
+ kfree(ei->d_children);
+ kfree(ei->entry_attrs);
+ kfree(ei);
+}
+
/**
- * eventfs_set_ef_status_free - set the ef->status to free
+ * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
* @ti: the tracefs_inode of the dentry
- * @dentry: dentry who's status to be freed
+ * @dentry: dentry which has the reference to remove.
*
- * eventfs_set_ef_status_free will be called if no more
- * references remain
+ * Remove the association between a dentry and an eventfs_inode.
*/
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
{
- struct tracefs_inode *ti_parent;
struct eventfs_inode *ei;
- struct eventfs_file *ef, *tmp;
-
- /* The top level events directory may be freed by this */
- if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
- LIST_HEAD(ef_del_list);
+ int i;
- mutex_lock(&eventfs_mutex);
+ mutex_lock(&eventfs_mutex);
- ei = ti->private;
+ ei = dentry->d_fsdata;
+ if (!ei)
+ goto out;
- /* Record all the top level files */
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
- lockdep_is_held(&eventfs_mutex)) {
- list_add_tail(&ef->del_list, &ef_del_list);
+ /* This could belong to one of the files of the ei */
+ if (ei->dentry != dentry) {
+ for (i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i] == dentry)
+ break;
}
+ if (WARN_ON_ONCE(i == ei->nr_entries))
+ goto out;
+ ei->d_children[i] = NULL;
+ } else if (ei->is_freed) {
+ free_ei(ei);
+ } else {
+ ei->dentry = NULL;
+ }
- /* Nothing should access this, but just in case! */
- ti->private = NULL;
+ dentry->d_fsdata = NULL;
+ out:
+ mutex_unlock(&eventfs_mutex);
+}
- mutex_unlock(&eventfs_mutex);
+/**
+ * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * @ei: the eventfs_inode that the file will be created under
+ * @idx: the index into the d_children[] of the @ei
+ * @parent: The parent dentry of the created file.
+ * @name: The name of the file to create
+ * @mode: The mode of the file.
+ * @data: The data to use to set the inode of the file with on open()
+ * @fops: The fops of the file to be created.
+ * @lookup: True if called by the lookup routine, in which case dput() the created dentry.
+ *
+ * Create a dentry for a file of an eventfs_inode @ei and place it into the
+ * address located at @e_dentry. If the @e_dentry already has a dentry, then
+ * just do a dget() on it and return. Otherwise create the dentry and attach it.
+ */
+static struct dentry *
+create_file_dentry(struct eventfs_inode *ei, int idx,
+ struct dentry *parent, const char *name, umode_t mode, void *data,
+ const struct file_operations *fops, bool lookup)
+{
+ struct eventfs_attr *attr = NULL;
+ struct dentry **e_dentry = &ei->d_children[idx];
+ struct dentry *dentry;
- /* Now safely free the top level files and their children */
- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
- list_del(&ef->del_list);
- eventfs_remove(ef);
- }
+ WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
- kfree(ei);
- return;
+ mutex_lock(&eventfs_mutex);
+ if (ei->is_freed) {
+ mutex_unlock(&eventfs_mutex);
+ return NULL;
+ }
+ /* If the e_dentry already has a dentry, use it */
+ if (*e_dentry) {
+ /* lookup does not need to up the ref count */
+ if (!lookup)
+ dget(*e_dentry);
+ mutex_unlock(&eventfs_mutex);
+ return *e_dentry;
}
- mutex_lock(&eventfs_mutex);
+ /* ei->entry_attrs are protected by SRCU */
+ if (ei->entry_attrs)
+ attr = &ei->entry_attrs[idx];
- ti_parent = get_tracefs(dentry->d_parent->d_inode);
- if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
- goto out;
+ mutex_unlock(&eventfs_mutex);
- ef = dentry->d_fsdata;
- if (!ef)
- goto out;
+ dentry = create_file(name, mode, attr, parent, data, fops);
- /*
- * If ef was freed, then the LSB bit is set for d_fsdata.
- * But this should not happen, as it should still have a
- * ref count that prevents it. Warn in case it does.
- */
- if (WARN_ON_ONCE((unsigned long)ef & 1))
- goto out;
+ mutex_lock(&eventfs_mutex);
- dentry->d_fsdata = NULL;
- ef->dentry = NULL;
-out:
+ if (IS_ERR_OR_NULL(dentry)) {
+ /*
+ * When the mutex was released, something else could have
+ * created the dentry for this e_dentry. In which case
+ * use that one.
+ *
+ * If ei->is_freed is set, the e_dentry is currently on its
+ * way to being freed, don't return it. If e_dentry is NULL
+ * it means it was already freed.
+ */
+ if (ei->is_freed)
+ dentry = NULL;
+ else
+ dentry = *e_dentry;
+ /* The lookup does not need to up the dentry refcount */
+ if (dentry && !lookup)
+ dget(dentry);
+ mutex_unlock(&eventfs_mutex);
+ return dentry;
+ }
+
+ if (!*e_dentry && !ei->is_freed) {
+ *e_dentry = dentry;
+ dentry->d_fsdata = ei;
+ } else {
+ /*
+ * Should never happen unless we get here due to being freed.
+ * Otherwise it means two dentries exist with the same name.
+ */
+ WARN_ON_ONCE(!ei->is_freed);
+ dentry = NULL;
+ }
mutex_unlock(&eventfs_mutex);
+
+ if (lookup)
+ dput(dentry);
+
+ return dentry;
}
/**
* eventfs_post_create_dir - post create dir routine
- * @ef: eventfs_file of recently created dir
+ * @ei: eventfs_inode of recently created dir
*
* Map the meta-data of files within an eventfs dir to their parent dentry
*/
-static void eventfs_post_create_dir(struct eventfs_file *ef)
+static void eventfs_post_create_dir(struct eventfs_inode *ei)
{
- struct eventfs_file *ef_child;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
+ lockdep_assert_held(&eventfs_mutex);
+
/* srcu lock already held */
/* fill parent-child relation */
- list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- ef_child->d_parent = ef->dentry;
+ ei_child->d_parent = ei->dentry;
}
- ti = get_tracefs(ef->dentry->d_inode);
- ti->private = ef->ei;
+ ti = get_tracefs(ei->dentry->d_inode);
+ ti->private = ei;
}
/**
- * create_dentry - helper function to create dentry
- * @ef: eventfs_file of file or directory to create
- * @parent: parent dentry
- * @lookup: true if called from lookup routine
+ * create_dir_dentry - Create a directory dentry for the eventfs_inode
+ * @pei: The eventfs_inode parent of ei.
+ * @ei: The eventfs_inode to create the directory for
+ * @parent: The dentry of the parent of this directory
+ * @lookup: True if this is called by the lookup code
*
- * Used to create a dentry for file/dir, executes post dentry creation routine
+ * This creates and attaches a directory dentry to the eventfs_inode @ei.
*/
static struct dentry *
-create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
+create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
+ struct dentry *parent, bool lookup)
{
- bool invalidate = false;
- struct dentry *dentry;
+ struct dentry *dentry = NULL;
+
+ WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
mutex_lock(&eventfs_mutex);
- if (ef->is_freed) {
+ if (pei->is_freed || ei->is_freed) {
mutex_unlock(&eventfs_mutex);
return NULL;
}
- if (ef->dentry) {
- dentry = ef->dentry;
- /* On dir open, up the ref count */
+ if (ei->dentry) {
+ /* If the dentry already has a dentry, use it */
+ dentry = ei->dentry;
+ /* lookup does not need to up the ref count */
if (!lookup)
dget(dentry);
mutex_unlock(&eventfs_mutex);
@@ -302,97 +528,134 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
}
mutex_unlock(&eventfs_mutex);
- if (!lookup)
- inode_lock(parent->d_inode);
-
- if (ef->ei)
- dentry = create_dir(ef->name, parent, ef->data);
- else
- dentry = create_file(ef->name, ef->mode, parent,
- ef->data, ef->fop);
-
- if (!lookup)
- inode_unlock(parent->d_inode);
+ dentry = create_dir(ei, parent);
mutex_lock(&eventfs_mutex);
- if (IS_ERR_OR_NULL(dentry)) {
- /* If the ef was already updated get it */
- dentry = ef->dentry;
+
+ if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
+ /*
+ * When the mutex was released, something else could have
+ * created the dentry for this e_dentry. In which case
+ * use that one.
+ *
+ * If ei->is_freed is set, the e_dentry is currently on its
+ * way to being freed.
+ */
+ dentry = ei->dentry;
if (dentry && !lookup)
dget(dentry);
mutex_unlock(&eventfs_mutex);
return dentry;
}
- if (!ef->dentry && !ef->is_freed) {
- ef->dentry = dentry;
- if (ef->ei)
- eventfs_post_create_dir(ef);
- dentry->d_fsdata = ef;
+ if (!ei->dentry && !ei->is_freed) {
+ ei->dentry = dentry;
+ eventfs_post_create_dir(ei);
+ dentry->d_fsdata = ei;
} else {
- /* A race here, should try again (unless freed) */
- invalidate = true;
-
/*
* Should never happen unless we get here due to being freed.
* Otherwise it means two dentries exist with the same name.
*/
- WARN_ON_ONCE(!ef->is_freed);
+ WARN_ON_ONCE(!ei->is_freed);
+ dentry = NULL;
}
mutex_unlock(&eventfs_mutex);
- if (invalidate)
- d_invalidate(dentry);
- if (lookup || invalidate)
+ if (lookup)
dput(dentry);
- return invalidate ? NULL : dentry;
-}
-
-static bool match_event_file(struct eventfs_file *ef, const char *name)
-{
- bool ret;
-
- mutex_lock(&eventfs_mutex);
- ret = !ef->is_freed && strcmp(ef->name, name) == 0;
- mutex_unlock(&eventfs_mutex);
-
- return ret;
+ return dentry;
}
/**
* eventfs_root_lookup - lookup routine to create file/dir
* @dir: in which a lookup is being done
* @dentry: file/dir dentry
- * @flags: to pass as flags parameter to simple lookup
+ * @flags: Just passed to simple_lookup()
*
- * Used to create a dynamic file/dir within @dir. Use the eventfs_inode
- * list of meta data to find the information needed to create the file/dir.
+ * Used to create a dynamic file/dir within @dir. Searches the @ei
+ * list; if @dentry is found, go ahead and create the file/dir.
*/
+
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct eventfs_file *ef;
+ struct dentry *ei_dentry = NULL;
struct dentry *ret = NULL;
+ const char *name = dentry->d_name.name;
+ bool created = false;
+ umode_t mode;
+ void *data;
int idx;
+ int i;
+ int r;
ti = get_tracefs(dir);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return NULL;
- ei = ti->private;
+ /* Grab srcu to prevent the ei from going away */
idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+
+ /*
+	 * Grab the eventfs_mutex to get a consistent value from ti->private.
+ */
+ mutex_lock(&eventfs_mutex);
+ ei = READ_ONCE(ti->private);
+ if (ei && !ei->is_freed)
+ ei_dentry = READ_ONCE(ei->dentry);
+ mutex_unlock(&eventfs_mutex);
+
+ if (!ei || !ei_dentry)
+ goto out;
+
+ data = ei->data;
+
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- if (!match_event_file(ef, dentry->d_name.name))
+ if (strcmp(ei_child->name, name) != 0)
continue;
ret = simple_lookup(dir, dentry, flags);
- create_dentry(ef, ef->d_parent, true);
+ if (IS_ERR(ret))
+ goto out;
+ create_dir_dentry(ei, ei_child, ei_dentry, true);
+ created = true;
break;
}
+
+ if (created)
+ goto out;
+
+ for (i = 0; i < ei->nr_entries; i++) {
+ entry = &ei->entries[i];
+ if (strcmp(name, entry->name) == 0) {
+ void *cdata = data;
+ mutex_lock(&eventfs_mutex);
+ /* If ei->is_freed, then the event itself may be too */
+ if (!ei->is_freed)
+ r = entry->callback(name, &mode, &cdata, &fops);
+ else
+ r = -1;
+ mutex_unlock(&eventfs_mutex);
+ if (r <= 0)
+ continue;
+ ret = simple_lookup(dir, dentry, flags);
+ if (IS_ERR(ret))
+ goto out;
+ create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
+ fops, true);
+ break;
+ }
+ }
+ out:
srcu_read_unlock(&eventfs_srcu, idx);
return ret;
}
@@ -432,29 +695,48 @@ static int eventfs_release(struct inode *inode, struct file *file)
return dcache_dir_close(inode, file);
}
+static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt)
+{
+ struct dentry **tmp;
+
+ tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS);
+ if (!tmp)
+ return -1;
+ tmp[cnt] = d;
+ tmp[cnt + 1] = NULL;
+ *dentries = tmp;
+ return 0;
+}
+
/**
* dcache_dir_open_wrapper - eventfs open wrapper
* @inode: not used
- * @file: dir to be opened (to create its child)
+ * @file: dir to be opened (to create its children)
*
- * Used to dynamically create the file/dir within @file. @file is really a
- * directory and all the files/dirs of the children within @file will be
- * created. If any of the files/dirs have already been created, their
- * reference count will be incremented.
+ * Used to dynamically create the files/dirs within @file. All of the
+ * files/dirs will be created. If any have already been created, their
+ * references will be increased.
*/
static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
{
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct eventfs_file *ef;
struct dentry_list *dlist;
struct dentry **dentries = NULL;
- struct dentry *dentry = file_dentry(file);
+ struct dentry *parent = file_dentry(file);
struct dentry *d;
struct inode *f_inode = file_inode(file);
+ const char *name = parent->d_name.name;
+ umode_t mode;
+ void *data;
int cnt = 0;
int idx;
int ret;
+ int i;
+ int r;
ti = get_tracefs(f_inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
@@ -463,27 +745,60 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
if (WARN_ON_ONCE(file->private_data))
return -EINVAL;
+ idx = srcu_read_lock(&eventfs_srcu);
+
+ mutex_lock(&eventfs_mutex);
+ ei = READ_ONCE(ti->private);
+ mutex_unlock(&eventfs_mutex);
+
+ if (!ei) {
+ srcu_read_unlock(&eventfs_srcu, idx);
+ return -EINVAL;
+ }
+
+
+ data = ei->data;
+
dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
- if (!dlist)
+ if (!dlist) {
+ srcu_read_unlock(&eventfs_srcu, idx);
return -ENOMEM;
+ }
- ei = ti->private;
- idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ inode_lock(parent->d_inode);
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- d = create_dentry(ef, dentry, false);
+ d = create_dir_dentry(ei, ei_child, parent, false);
if (d) {
- struct dentry **tmp;
+ ret = add_dentries(&dentries, d, cnt);
+ if (ret < 0)
+ break;
+ cnt++;
+ }
+ }
- tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
- if (!tmp)
+ for (i = 0; i < ei->nr_entries; i++) {
+ void *cdata = data;
+ entry = &ei->entries[i];
+ name = entry->name;
+ mutex_lock(&eventfs_mutex);
+ /* If ei->is_freed, then the event itself may be too */
+ if (!ei->is_freed)
+ r = entry->callback(name, &mode, &cdata, &fops);
+ else
+ r = -1;
+ mutex_unlock(&eventfs_mutex);
+ if (r <= 0)
+ continue;
+ d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false);
+ if (d) {
+ ret = add_dentries(&dentries, d, cnt);
+ if (ret < 0)
break;
- tmp[cnt] = d;
- tmp[cnt + 1] = NULL;
cnt++;
- dentries = tmp;
}
}
+ inode_unlock(parent->d_inode);
srcu_read_unlock(&eventfs_srcu, idx);
ret = dcache_dir_open(inode, file);
@@ -514,287 +829,253 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
}
/**
- * eventfs_prepare_ef - helper function to prepare eventfs_file
- * @name: the name of the file/directory to create.
- * @mode: the permission that the file should have.
- * @fop: struct file_operations that should be used for this file/directory.
- * @iop: struct inode_operations that should be used for this file/directory.
- * @data: something that the caller will want to get to later on. The
- * inode.i_private pointer will point to this value on the open() call.
+ * eventfs_create_dir - Create the eventfs_inode for this directory
+ * @name: The name of the directory to create.
+ * @parent: The eventfs_inode of the parent directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
+ *
+ * This function creates the descriptor to represent a directory in the
+ * eventfs. This descriptor is an eventfs_inode, and it is returned to be
+ * used to create other children underneath.
*
- * This function allocates and fills the eventfs_file structure.
+ * The @entries is an array of eventfs_entry structures which has:
+ * const char *name
+ * eventfs_callback callback;
+ *
+ * The name is the name of the file, and the callback is a pointer to a function
+ * that will be called when the file is referenced (either by lookup or by
+ * reading a directory). The callback is of the prototype:
+ *
+ * int callback(const char *name, umode_t *mode, void **data,
+ * const struct file_operations **fops);
+ *
+ * When a file needs to be created, this callback will be called with
+ * name = the name of the file being created (so that the same callback
+ * may be used for multiple files).
+ * mode = a place to set the file's mode
+ * data = A pointer to @data, and the callback may replace it, which will
+ * cause the file created to pass the new data to the open() call.
+ * fops = the fops to use for the created file.
+ *
+ * NB. @callback is called while holding internal locks of the eventfs
+ * system. The callback must not call any code that might also call into
+ * the tracefs or eventfs system or it will risk creating a deadlock.
*/
-static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
- const struct file_operations *fop,
- const struct inode_operations *iop,
- void *data)
+struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
+ const struct eventfs_entry *entries,
+ int size, void *data)
{
- struct eventfs_file *ef;
+ struct eventfs_inode *ei;
- ef = kzalloc(sizeof(*ef), GFP_KERNEL);
- if (!ef)
+ if (!parent)
+ return ERR_PTR(-EINVAL);
+
+ ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ if (!ei)
return ERR_PTR(-ENOMEM);
- ef->name = kstrdup(name, GFP_KERNEL);
- if (!ef->name) {
- kfree(ef);
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name) {
+ kfree(ei);
return ERR_PTR(-ENOMEM);
}
- if (S_ISDIR(mode)) {
- ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
- if (!ef->ei) {
- kfree(ef->name);
- kfree(ef);
+ if (size) {
+ ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+ if (!ei->d_children) {
+ kfree_const(ei->name);
+ kfree(ei);
return ERR_PTR(-ENOMEM);
}
- INIT_LIST_HEAD(&ef->ei->e_top_files);
- } else {
- ef->ei = NULL;
}
- ef->iop = iop;
- ef->fop = fop;
- ef->mode = mode;
- ef->data = data;
- return ef;
+ ei->entries = entries;
+ ei->nr_entries = size;
+ ei->data = data;
+ INIT_LIST_HEAD(&ei->children);
+ INIT_LIST_HEAD(&ei->list);
+
+ mutex_lock(&eventfs_mutex);
+ if (!parent->is_freed) {
+ list_add_tail(&ei->list, &parent->children);
+ ei->d_parent = parent->dentry;
+ }
+ mutex_unlock(&eventfs_mutex);
+
+ /* Was the parent freed? */
+ if (list_empty(&ei->list)) {
+ free_ei(ei);
+ ei = NULL;
+ }
+ return ei;
}
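As a concrete reading of the callback contract documented above: a return value greater than zero asks eventfs to create the file with the mode and fops the callback filled in, while zero or a negative value skips the entry, matching the r <= 0 checks in the lookup and open paths. A minimal sketch follows, assuming the usual kernel headers; my_event_fops and my_events_callback are hypothetical names, not part of this patch:

/* Hypothetical fops supplied by the subsystem using eventfs. */
static const struct file_operations my_event_fops;

static int my_events_callback(const char *name, umode_t *mode, void **data,
			      const struct file_operations **fops)
{
	/* The same callback may serve several entries; dispatch on @name. */
	if (strcmp(name, "enable") == 0) {
		*mode = 0644;
		*fops = &my_event_fops;
		return 1;	/* > 0: create this file */
	}
	return 0;		/* <= 0: skip this entry */
}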
/**
- * eventfs_create_events_dir - create the trace event structure
- * @name: the name of the directory to create.
- * @parent: parent dentry for this file. This should be a directory dentry
- * if set. If this parameter is NULL, then the directory will be
- * created in the root of the tracefs filesystem.
+ * eventfs_create_events_dir - create the top level events directory
+ * @name: The name of the top level directory to create.
+ * @parent: Parent dentry for this file in the tracefs directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
*
* This function creates the top of the trace event directory.
+ *
+ * See eventfs_create_dir() for use of @entries.
*/
-struct dentry *eventfs_create_events_dir(const char *name,
- struct dentry *parent)
+struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
+ const struct eventfs_entry *entries,
+ int size, void *data)
{
struct dentry *dentry = tracefs_start_creating(name, parent);
struct eventfs_inode *ei;
struct tracefs_inode *ti;
struct inode *inode;
+ kuid_t uid;
+ kgid_t gid;
if (security_locked_down(LOCKDOWN_TRACEFS))
return NULL;
if (IS_ERR(dentry))
- return dentry;
+ return ERR_CAST(dentry);
ei = kzalloc(sizeof(*ei), GFP_KERNEL);
if (!ei)
- return ERR_PTR(-ENOMEM);
+ goto fail_ei;
+
inode = tracefs_get_inode(dentry->d_sb);
- if (unlikely(!inode)) {
- kfree(ei);
- tracefs_failed_creating(dentry);
- return ERR_PTR(-ENOMEM);
+ if (unlikely(!inode))
+ goto fail;
+
+ if (size) {
+ ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+ if (!ei->d_children)
+ goto fail;
}
- INIT_LIST_HEAD(&ei->e_top_files);
+ ei->dentry = dentry;
+ ei->entries = entries;
+ ei->nr_entries = size;
+ ei->is_events = 1;
+ ei->data = data;
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name)
+ goto fail;
+
+ /* Save the ownership of this directory */
+ uid = d_inode(dentry->d_parent)->i_uid;
+ gid = d_inode(dentry->d_parent)->i_gid;
+
+ /* This is used as the default ownership of the files and directories */
+ ei->attr.uid = uid;
+ ei->attr.gid = gid;
+
+ INIT_LIST_HEAD(&ei->children);
+ INIT_LIST_HEAD(&ei->list);
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
ti->private = ei;
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
+ dentry->d_fsdata = ei;
+
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
inc_nlink(dentry->d_parent->d_inode);
fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
- return tracefs_end_creating(dentry);
-}
-
-/**
- * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
- * @name: the name of the file to create.
- * @parent: parent dentry for this dir.
- *
- * This function adds eventfs subsystem dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
- struct dentry *parent)
-{
- struct tracefs_inode *ti_parent;
- struct eventfs_inode *ei_parent;
- struct eventfs_file *ef;
+ tracefs_end_creating(dentry);
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return NULL;
+ return ei;
- if (!parent)
- return ERR_PTR(-EINVAL);
-
- ti_parent = get_tracefs(parent->d_inode);
- ei_parent = ti_parent->private;
-
- ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
- if (IS_ERR(ef))
- return ef;
-
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ei_parent->e_top_files);
- ef->d_parent = parent;
- mutex_unlock(&eventfs_mutex);
- return ef;
+ fail:
+ kfree(ei->d_children);
+ kfree(ei);
+ fail_ei:
+ tracefs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
}
-/**
- * eventfs_add_dir - add eventfs dir to list to create later
- * @name: the name of the file to create.
- * @ef_parent: parent eventfs_file for this dir.
- *
- * This function adds eventfs dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_dir(const char *name,
- struct eventfs_file *ef_parent)
-{
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return NULL;
-
- if (!ef_parent)
- return ERR_PTR(-EINVAL);
-
- ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
- if (IS_ERR(ef))
- return ef;
+static LLIST_HEAD(free_list);
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
- ef->d_parent = ef_parent->dentry;
- mutex_unlock(&eventfs_mutex);
- return ef;
-}
-
-/**
- * eventfs_add_events_file - add the data needed to create a file for later reference
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * dentry/inode within the top level events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_events_file(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fop)
+static void eventfs_workfn(struct work_struct *work)
{
- struct tracefs_inode *ti;
- struct eventfs_inode *ei;
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return -ENODEV;
-
- if (!parent)
- return -EINVAL;
-
- if (!(mode & S_IFMT))
- mode |= S_IFREG;
-
- if (!parent->d_inode)
- return -EINVAL;
-
- ti = get_tracefs(parent->d_inode);
- if (!(ti->flags & TRACEFS_EVENT_INODE))
- return -EINVAL;
-
- ei = ti->private;
- ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
-
- if (IS_ERR(ef))
- return -ENOMEM;
-
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ei->e_top_files);
- ef->d_parent = parent;
- mutex_unlock(&eventfs_mutex);
- return 0;
+ struct eventfs_inode *ei, *tmp;
+ struct llist_node *llnode;
+
+ llnode = llist_del_all(&free_list);
+ llist_for_each_entry_safe(ei, tmp, llnode, llist) {
+ /* This dput() matches the dget() from unhook_dentry() */
+ for (int i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i])
+ dput(ei->d_children[i]);
+ }
+ /* We should only get here if the ei had a dentry */
+ if (!WARN_ON_ONCE(!ei->dentry))
+ dput(ei->dentry);
+ }
}
-/**
- * eventfs_add_file - add eventfs file to list to create later
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @ef_parent: parent eventfs_file for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * file within a subdirectory of the events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_file(const char *name, umode_t mode,
- struct eventfs_file *ef_parent,
- void *data,
- const struct file_operations *fop)
-{
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return -ENODEV;
+static DECLARE_WORK(eventfs_work, eventfs_workfn);
- if (!ef_parent)
- return -EINVAL;
+static void free_rcu_ei(struct rcu_head *head)
+{
+ struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
- if (!(mode & S_IFMT))
- mode |= S_IFREG;
+ if (ei->dentry) {
+ /* Do not free the ei until all references to the dentry are gone */
+ if (llist_add(&ei->llist, &free_list))
+ queue_work(system_unbound_wq, &eventfs_work);
+ return;
+ }
- ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
- if (IS_ERR(ef))
- return -ENOMEM;
+ /* If the ei doesn't have a dentry, neither should its children */
+ for (int i = 0; i < ei->nr_entries; i++) {
+ WARN_ON_ONCE(ei->d_children[i]);
+ }
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
- ef->d_parent = ef_parent->dentry;
- mutex_unlock(&eventfs_mutex);
- return 0;
+ free_ei(ei);
}
-static void free_ef(struct rcu_head *head)
+static void unhook_dentry(struct dentry *dentry)
{
- struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu);
+ if (!dentry)
+ return;
+ /*
+ * Need to add a reference to the dentry that is expected by
+ * simple_recursive_removal(), which will include a dput().
+ */
+ dget(dentry);
- kfree(ef->name);
- kfree(ef->ei);
- kfree(ef);
+ /*
+ * Also add a reference for the dput() in eventfs_workfn().
+ * That is required as that dput() will free the ei after
+ * the SRCU grace period is over.
+ */
+ dget(dentry);
}
/**
* eventfs_remove_rec - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
- * @head: to create list of eventfs_file to be deleted
- * @level: to check recursion depth
+ * @ei: eventfs_inode to be removed.
+ * @level: prevent recursion from going more than 3 levels deep.
*
- * The helper function eventfs_remove_rec() is used to clean up and free the
- * associated data from eventfs for both of the added functions.
+ * This function recursively removes eventfs_inodes which
+ * contain information about files and/or directories.
*/
-static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level)
+static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
{
- struct eventfs_file *ef_child;
+ struct eventfs_inode *ei_child;
- if (!ef)
+ if (!ei)
return;
/*
* Check recursion depth. It should never be greater than 3:
@@ -806,100 +1087,76 @@ static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head,
if (WARN_ON_ONCE(level > 3))
return;
- if (ef->ei) {
- /* search for nested folders or files */
- list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
- lockdep_is_held(&eventfs_mutex)) {
- eventfs_remove_rec(ef_child, head, level + 1);
+ /* search for nested folders or files */
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
+ lockdep_is_held(&eventfs_mutex)) {
+ /* Children only have dentry if parent does */
+ WARN_ON_ONCE(ei_child->dentry && !ei->dentry);
+ eventfs_remove_rec(ei_child, level + 1);
+ }
+
+ ei->is_freed = 1;
+
+ for (int i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i]) {
+ /* Children only have dentry if parent does */
+ WARN_ON_ONCE(!ei->dentry);
+ unhook_dentry(ei->d_children[i]);
}
}
- list_del_rcu(&ef->list);
- list_add_tail(&ef->del_list, head);
+ unhook_dentry(ei->dentry);
+
+ list_del_rcu(&ei->list);
+ call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei);
}
/**
- * eventfs_remove - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
+ * eventfs_remove_dir - remove eventfs dir or file from list
+ * @ei: eventfs_inode to be removed.
*
 * This function acquires the eventfs_mutex lock and calls eventfs_remove_rec()
*/
-void eventfs_remove(struct eventfs_file *ef)
+void eventfs_remove_dir(struct eventfs_inode *ei)
{
- struct eventfs_file *tmp;
- LIST_HEAD(ef_del_list);
- struct dentry *dentry_list = NULL;
struct dentry *dentry;
- if (!ef)
+ if (!ei)
return;
mutex_lock(&eventfs_mutex);
- eventfs_remove_rec(ef, &ef_del_list, 0);
- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
- if (ef->dentry) {
- unsigned long ptr = (unsigned long)dentry_list;
-
- /* Keep the dentry from being freed yet */
- dget(ef->dentry);
-
- /*
- * Paranoid: The dget() above should prevent the dentry
- * from being freed and calling eventfs_set_ef_status_free().
- * But just in case, set the link list LSB pointer to 1
- * and have eventfs_set_ef_status_free() check that to
- * make sure that if it does happen, it will not think
- * the d_fsdata is an event_file.
- *
- * For this to work, no event_file should be allocated
- * on a odd space, as the ef should always be allocated
- * to be at least word aligned. Check for that too.
- */
- WARN_ON_ONCE(ptr & 1);
-
- ef->dentry->d_fsdata = (void *)(ptr | 1);
- dentry_list = ef->dentry;
- ef->dentry = NULL;
- }
- call_srcu(&eventfs_srcu, &ef->rcu, free_ef);
- }
+ dentry = ei->dentry;
+ eventfs_remove_rec(ei, 0);
mutex_unlock(&eventfs_mutex);
- while (dentry_list) {
- unsigned long ptr;
-
- dentry = dentry_list;
- ptr = (unsigned long)dentry->d_fsdata & ~1UL;
- dentry_list = (struct dentry *)ptr;
- dentry->d_fsdata = NULL;
- d_invalidate(dentry);
- mutex_lock(&eventfs_mutex);
- /* dentry should now have at least a single reference */
- WARN_ONCE((int)d_count(dentry) < 1,
- "dentry %p less than one reference (%d) after invalidate\n",
- dentry, d_count(dentry));
- mutex_unlock(&eventfs_mutex);
- dput(dentry);
- }
+ /*
+ * If any of the ei children has a dentry, then the ei itself
+ * must have a dentry.
+ */
+ if (dentry)
+ simple_recursive_removal(dentry, NULL);
}
/**
- * eventfs_remove_events_dir - remove eventfs dir or file from list
- * @dentry: events's dentry to be removed.
+ * eventfs_remove_events_dir - remove the top level eventfs directory
+ * @ei: the eventfs_inode returned by eventfs_create_events_dir().
*
- * This function remove events main directory
+ * This function removes the events main directory
*/
-void eventfs_remove_events_dir(struct dentry *dentry)
+void eventfs_remove_events_dir(struct eventfs_inode *ei)
{
- struct tracefs_inode *ti;
-
- if (!dentry || !dentry->d_inode)
- return;
+ struct dentry *dentry;
- ti = get_tracefs(dentry->d_inode);
- if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
- return;
+ dentry = ei->dentry;
+ eventfs_remove_dir(ei);
- d_invalidate(dentry);
+ /*
+ * Matches the dget() done by tracefs_start_creating()
+ * in eventfs_create_events_dir() when the dentry was
+ * created. In other words, it's a normal dentry that
+ * sticks around while the other ei->dentry are created
+ * and destroyed dynamically.
+ */
dput(dentry);
}
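Putting the new API together: a subsystem now creates the events directory once, describes its files with an eventfs_entry table, and tears everything down with a single call. A hedged usage sketch follows; apart from the eventfs_* functions and struct eventfs_entry introduced by this series, every identifier (including the my_events_callback sketched earlier) is invented for illustration:

static const struct eventfs_entry my_entries[] = {
	{ .name = "enable", .callback = my_events_callback },
};

static struct eventfs_inode *my_events;

static int my_subsystem_init(struct dentry *tracefs_parent, void *data)
{
	my_events = eventfs_create_events_dir("events", tracefs_parent,
					      my_entries,
					      ARRAY_SIZE(my_entries), data);
	if (IS_ERR_OR_NULL(my_events))
		return -ENOMEM;
	return 0;
}

static void my_subsystem_exit(void)
{
	/* Recursively removes the directory and its children. */
	eventfs_remove_events_dir(my_events);
}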
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 891653ba9cf3..bc86ffdb103b 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -152,7 +152,7 @@ struct inode *tracefs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
@@ -210,14 +210,24 @@ repeat:
next = this_parent->d_subdirs.next;
resume:
while (next != &this_parent->d_subdirs) {
+ struct tracefs_inode *ti;
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
next = tmp->next;
+ /* Note, getdents() can add a cursor dentry with no inode */
+ if (!dentry->d_inode)
+ continue;
+
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
change_gid(dentry, gid);
+ /* If this is the events directory, update that too */
+ ti = get_tracefs(dentry->d_inode);
+ if (ti && (ti->flags & TRACEFS_EVENT_INODE))
+ eventfs_update_gid(dentry, gid);
+
if (!list_empty(&dentry->d_subdirs)) {
spin_unlock(&this_parent->d_lock);
spin_release(&dentry->d_lock.dep_map, _RET_IP_);
@@ -385,7 +395,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
ti = get_tracefs(inode);
if (ti && ti->flags & TRACEFS_EVENT_INODE)
- eventfs_set_ef_status_free(ti, dentry);
+ eventfs_set_ei_status_free(ti, dentry);
iput(inode);
}
@@ -509,20 +519,15 @@ struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
struct dentry *dentry;
int error;
+ /* Must always have a parent. */
+ if (WARN_ON_ONCE(!parent))
+ return ERR_PTR(-EINVAL);
+
error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
&tracefs_mount_count);
if (error)
return ERR_PTR(error);
- /*
- * If the parent is not specified, we create it in the root.
- * We need the root dentry to do this, which is in the super
- * block. A pointer to that is in the struct vfsmount that we
- * have around.
- */
- if (!parent)
- parent = tracefs_mount->mnt_root;
-
if (unlikely(IS_DEADDIR(parent->d_inode)))
dentry = ERR_PTR(-ENOENT);
else
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 4f2e49e2197b..42bdeb471a07 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -13,6 +13,59 @@ struct tracefs_inode {
struct inode vfs_inode;
};
+/*
+ * struct eventfs_attr - cache the mode and ownership of an eventfs entry
+ * @mode: saved mode plus flags of what is saved
+ * @uid: saved uid if changed
+ * @gid: saved gid if changed
+ */
+struct eventfs_attr {
+ int mode;
+ kuid_t uid;
+ kgid_t gid;
+};
+
+/*
+ * struct eventfs_inode - hold the properties of the eventfs directories.
+ * @list: link list into the parent directory
+ * @entries: the array of entries representing the files in the directory
+ * @name: the name of the directory to create
+ * @children: link list into the child eventfs_inode
+ * @dentry: the dentry of the directory
+ * @d_parent: pointer to the parent's dentry
+ * @d_children: The array of dentries to represent the files when created
+ * @entry_attrs: Saved mode and ownership of the @d_children
+ * @attr: Saved mode and ownership of eventfs_inode itself
+ * @data: The private data to pass to the callbacks
+ * @is_freed: Flag set if the eventfs is on its way to be freed
+ * Note if is_freed is set, then dentry is corrupted.
+ * @nr_entries: The number of items in @entries
+ */
+struct eventfs_inode {
+ struct list_head list;
+ const struct eventfs_entry *entries;
+ const char *name;
+ struct list_head children;
+ struct dentry *dentry; /* Check is_freed to access */
+ struct dentry *d_parent;
+ struct dentry **d_children;
+ struct eventfs_attr *entry_attrs;
+ struct eventfs_attr attr;
+ void *data;
+ /*
+ * Union - used for deletion
+ * @llist: for calling dput() if needed after RCU
+ * @rcu: eventfs_inode to delete in RCU
+ */
+ union {
+ struct llist_node llist;
+ struct rcu_head rcu;
+ };
+ unsigned int is_freed:1;
+ unsigned int is_events:1;
+ unsigned int nr_entries:30;
+};
+
static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
{
return container_of(inode, struct tracefs_inode, vfs_inode);
@@ -25,6 +78,7 @@ struct inode *tracefs_get_inode(struct super_block *sb);
struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
struct dentry *eventfs_failed_creating(struct dentry *dentry);
struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index e564d5ff8781..0d561ecb6869 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -9,10 +9,9 @@
* This file implements various helper functions for UBIFS authentication support
*/
-#include <linux/crypto.h>
#include <linux/verification.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
#include <keys/user-type.h>
#include <keys/asymmetric-type.h>
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 3125e76376ee..921f9033d0d2 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -88,8 +88,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
}
const struct fscrypt_operations ubifs_crypt_operations = {
- .flags = FS_CFLG_OWN_PAGES,
- .key_prefix = "ubifs:",
+ .legacy_key_prefix = "ubifs:",
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef9e527d9ff..d013c5b3f1ed 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -237,14 +237,14 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode));
pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode));
pr_err("\tatime %u.%u\n",
- (unsigned int)inode->i_atime.tv_sec,
- (unsigned int)inode->i_atime.tv_nsec);
+ (unsigned int) inode_get_atime_sec(inode),
+ (unsigned int) inode_get_atime_nsec(inode));
pr_err("\tmtime %u.%u\n",
- (unsigned int)inode->i_mtime.tv_sec,
- (unsigned int)inode->i_mtime.tv_nsec);
+ (unsigned int) inode_get_mtime_sec(inode),
+ (unsigned int) inode_get_mtime_nsec(inode));
pr_err("\tctime %u.%u\n",
- (unsigned int) inode_get_ctime(inode).tv_sec,
- (unsigned int) inode_get_ctime(inode).tv_nsec);
+ (unsigned int) inode_get_ctime_sec(inode),
+ (unsigned int) inode_get_ctime_nsec(inode));
pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum);
pr_err("\txattr_size %u\n", ui->xattr_size);
pr_err("\txattr_cnt %u\n", ui->xattr_cnt);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 2f48c58d47cd..3b13c648d490 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -96,7 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
inode->i_flags |= S_NOCMTIME;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->nrpages = 0;
if (!is_xattr) {
@@ -324,7 +324,8 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -724,7 +725,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = d_inode(old_dentry);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
- int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, sz_change;
struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
struct fscrypt_name nm;
@@ -748,6 +749,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
err = dbg_check_synced_i_size(c, inode);
if (err)
goto out_fname;
@@ -767,7 +770,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
inode_set_ctime_current(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -841,7 +845,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -944,7 +949,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -1018,7 +1024,8 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inc_nlink(dir);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err) {
ubifs_err(c, "cannot create directory, error %d", err);
@@ -1109,7 +1116,8 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -1209,7 +1217,8 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e5382f0b2587..2d2b39f843ce 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1088,9 +1088,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
if (attr->ia_valid & ATTR_GID)
inode->i_gid = attr->ia_gid;
if (attr->ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (attr->ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (attr->ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (attr->ia_valid & ATTR_MODE) {
@@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
/* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
/* 'truncate_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1365,9 +1365,9 @@ static inline int mctime_update_needed(const struct inode *inode,
const struct timespec64 *now)
{
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
- if (!timespec64_equal(&inode->i_mtime, now) ||
- !timespec64_equal(&ctime, now))
+ if (!timespec64_equal(&mtime, now) || !timespec64_equal(&ctime, now))
return 1;
return 0;
}
@@ -1375,6 +1375,9 @@ static inline int mctime_update_needed(const struct inode *inode,
/**
* ubifs_update_time - update time of inode.
* @inode: inode to update
+ * @time: timespec structure holding the current time value
+ * @flags: control flags that determine which time fields of
+ * @inode to update
*
* This function updates time of the inode.
*/
@@ -1429,7 +1432,7 @@ static int update_mctime(struct inode *inode)
return err;
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
@@ -1567,7 +1570,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
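The ubifs, udf and ufs hunks above and below repeat one mechanical conversion: direct loads and stores of inode->i_atime/i_mtime become calls to the timespec accessor helpers. A condensed illustration of the pattern, using only helpers that appear in this patch and invented function names:

/* Before: fields written and read directly. */
static void touch_mtime_old(struct inode *inode)
{
	inode->i_mtime = inode_set_ctime_current(inode);
}

/* After: accessor helpers, as used throughout this patch. */
static void touch_mtime_new(struct inode *inode)
{
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}

static time64_t sum_times_new(struct inode *inode)
{
	return inode_get_atime_sec(inode) + inode_get_mtime_sec(inode);
}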
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index ffc9beee7be6..f0a5538c84b0 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -452,12 +452,12 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
ino->ch.node_type = UBIFS_INO_NODE;
ino_key_init_flash(c, &ino->key, inode->i_ino);
ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
- ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
- ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ino->ctime_sec = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
- ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ino->atime_sec = cpu_to_le64(inode_get_atime_sec(inode));
+ ino->atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ ino->ctime_sec = cpu_to_le64(inode_get_ctime_sec(inode));
+ ino->ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ino->mtime_sec = cpu_to_le64(inode_get_mtime_sec(inode));
+ ino->mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
ino->uid = cpu_to_le32(i_uid_read(inode));
ino->gid = cpu_to_le32(i_gid_read(inode));
ino->mode = cpu_to_le32(inode->i_mode);
@@ -1607,6 +1607,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
ubifs_err(c, "bad data node (block %u, inode %lu)",
blk, inode->i_ino);
ubifs_dump_node(c, dn, dn_size);
+ err = -EUCLEAN;
goto out_free;
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 4211e4456b1e..c59d47fe7939 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -23,7 +23,6 @@
#include "ubifs.h"
#include <linux/list_sort.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
/**
* struct replay_entry - replay list entry.
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b08fb28d16b5..09e270d6ed02 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -54,11 +54,7 @@ module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_vers
static struct kmem_cache *ubifs_inode_slab;
/* UBIFS TNC shrinker description */
-static struct shrinker ubifs_shrinker_info = {
- .scan_objects = ubifs_shrink_scan,
- .count_objects = ubifs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *ubifs_shrinker_info;
/**
* validate_inode - validate inode.
@@ -142,10 +138,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
set_nlink(inode, le32_to_cpu(ino->nlink));
i_uid_write(inode, le32_to_cpu(ino->uid));
i_gid_write(inode, le32_to_cpu(ino->gid));
- inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
- inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
- inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
- inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
+ inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec),
+ le32_to_cpu(ino->atime_nsec));
+ inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec),
+ le32_to_cpu(ino->mtime_nsec));
inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec),
le32_to_cpu(ino->ctime_nsec));
inode->i_mode = le32_to_cpu(ino->mode);
@@ -923,8 +919,10 @@ static void free_buds(struct ubifs_info *c)
{
struct ubifs_bud *bud, *n;
- rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb)
+ rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) {
+ kfree(bud->log_hash);
kfree(bud);
+ }
}
/**
@@ -1193,6 +1191,7 @@ static void destroy_journal(struct ubifs_info *c)
bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
list_del(&bud->list);
+ kfree(bud->log_hash);
kfree(bud);
}
ubifs_destroy_idx_gc(c);
@@ -2373,7 +2372,7 @@ static void inode_slab_ctor(void *obj)
static int __init ubifs_init(void)
{
- int err;
+ int err = -ENOMEM;
BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);
@@ -2439,10 +2438,15 @@ static int __init ubifs_init(void)
if (!ubifs_inode_slab)
return -ENOMEM;
- err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab");
- if (err)
+ ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab");
+ if (!ubifs_shrinker_info)
goto out_slab;
+ ubifs_shrinker_info->count_objects = ubifs_shrink_count;
+ ubifs_shrinker_info->scan_objects = ubifs_shrink_scan;
+
+ shrinker_register(ubifs_shrinker_info);
+
err = ubifs_compressors_init();
if (err)
goto out_shrinker;
@@ -2467,7 +2471,7 @@ out_dbg:
dbg_debugfs_exit();
ubifs_compressors_exit();
out_shrinker:
- unregister_shrinker(&ubifs_shrinker_info);
+ shrinker_free(ubifs_shrinker_info);
out_slab:
kmem_cache_destroy(ubifs_inode_slab);
return err;
@@ -2483,7 +2487,7 @@ static void __exit ubifs_exit(void)
dbg_debugfs_exit();
ubifs_sysfs_exit();
ubifs_compressors_exit();
- unregister_shrinker(&ubifs_shrinker_info);
+ shrinker_free(ubifs_shrinker_info);
/*
* Make sure all delayed rcu free inodes are flushed before we
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6b7d95b65f4b..f4728e65d1bd 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -65,6 +65,7 @@ static void do_insert_old_idx(struct ubifs_info *c,
else {
ubifs_err(c, "old idx added twice!");
kfree(old_idx);
+ return;
}
}
rb_link_node(&old_idx->rb, parent, p);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ebb3ad6b5e7e..3916dc4f30ca 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -31,7 +31,7 @@
#include <linux/completion.h>
#include <crypto/hash_info.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
#include <linux/fscrypt.h>
@@ -2043,7 +2043,7 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
size_t size);
#ifdef CONFIG_UBIFS_FS_XATTR
-extern const struct xattr_handler *ubifs_xattr_handlers[];
+extern const struct xattr_handler * const ubifs_xattr_handlers[];
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum);
int ubifs_purge_xattrs(struct inode *host);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 406c82eab513..0847db521984 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -735,7 +735,7 @@ static const struct xattr_handler ubifs_security_xattr_handler = {
};
#endif
-const struct xattr_handler *ubifs_xattr_handlers[] = {
+const struct xattr_handler * const ubifs_xattr_handlers[] = {
&ubifs_user_xattr_handler,
&ubifs_trusted_xattr_handler,
#ifdef CONFIG_UBIFS_FS_SECURITY
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index de17a97e8667..415b050b977d 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -471,7 +471,7 @@ struct fileIdentDesc {
uint8_t lengthFileIdent;
struct long_ad icb;
__le16 lengthOfImpUse;
- uint8_t impUse[];
+ /* uint8_t impUse[]; */
/* uint8_t fileIdent[]; */
/* uint8_t padding[]; */
} __packed;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6b558cbbeb6b..5f1f969f4134 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -100,8 +100,8 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- iinfo->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ iinfo->i_crtime = inode_get_mtime(inode);
if (unlikely(insert_inode_locked(inode) < 0)) {
make_bad_inode(inode);
iput(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a17a6184cc39..d8493449d4c5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1296,7 +1296,7 @@ set_size:
goto out_unlock;
}
update_time:
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
udf_sync_inode(inode);
else
@@ -1327,7 +1327,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
int bs = inode->i_sb->s_blocksize;
int ret = -EIO;
uint32_t uid, gid;
- struct timespec64 ctime;
+ struct timespec64 ts;
reread:
if (iloc->partitionReferenceNum >= sbi->s_partitions) {
@@ -1504,10 +1504,12 @@ reread:
inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
(inode->i_sb->s_blocksize_bits - 9);
- udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime);
- udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime);
- udf_disk_stamp_to_time(&ctime, fe->attrTime);
- inode_set_ctime_to_ts(inode, ctime);
+ udf_disk_stamp_to_time(&ts, fe->accessTime);
+ inode_set_atime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, fe->modificationTime);
+ inode_set_mtime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, fe->attrTime);
+ inode_set_ctime_to_ts(inode, ts);
iinfo->i_unique = le64_to_cpu(fe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1519,11 +1521,13 @@ reread:
inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
(inode->i_sb->s_blocksize_bits - 9);
- udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime);
- udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime);
+ udf_disk_stamp_to_time(&ts, efe->accessTime);
+ inode_set_atime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, efe->modificationTime);
+ inode_set_mtime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, efe->attrTime);
+ inode_set_ctime_to_ts(inode, ts);
udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime);
- udf_disk_stamp_to_time(&ctime, efe->attrTime);
- inode_set_ctime_to_ts(inode, ctime);
iinfo->i_unique = le64_to_cpu(efe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
@@ -1798,8 +1802,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
inode->i_sb->s_blocksize - sizeof(struct fileEntry));
fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
- udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
- udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
+ udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode));
+ udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode));
udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode));
memset(&(fe->impIdent), 0, sizeof(struct regid));
strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
@@ -1829,12 +1833,14 @@ static int udf_update_inode(struct inode *inode, int do_sync)
cpu_to_le32(inode->i_sb->s_blocksize);
}
- udf_adjust_time(iinfo, inode->i_atime);
- udf_adjust_time(iinfo, inode->i_mtime);
+ udf_adjust_time(iinfo, inode_get_atime(inode));
+ udf_adjust_time(iinfo, inode_get_mtime(inode));
udf_adjust_time(iinfo, inode_get_ctime(inode));
- udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
- udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
+ udf_time_to_disk_stamp(&efe->accessTime,
+ inode_get_atime(inode));
+ udf_time_to_disk_stamp(&efe->modificationTime,
+ inode_get_mtime(inode));
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode));
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ae55ab8859b6..3508ac484da3 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
*(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
udf_fiiter_write_fi(&iter, NULL);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, false, 1);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, true, 1);
inc_nlink(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
d_instantiate_new(dentry, inode);
@@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_size = 0;
inode_dec_link_count(dir);
udf_add_fid_counter(dir->i_sb, true, -1);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
mark_inode_dirty(dir);
ret = 0;
end_rmdir:
@@ -555,7 +555,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
set_nlink(inode, 1);
}
udf_fiiter_delete_entry(&iter);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_dec_link_count(inode);
udf_add_fid_counter(dir->i_sb, false, -1);
@@ -748,7 +748,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
udf_add_fid_counter(dir->i_sb, false, 1);
inode_set_ctime_current(inode);
mark_inode_dirty(inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ihold(inode);
d_instantiate(dentry, inode);
@@ -866,8 +866,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode),
-1);
}
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
+ inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
mark_inode_dirty(old_dir);
mark_inode_dirty(new_dir);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 9af6ff7f9747..f9a60bc1abcf 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -86,7 +86,7 @@ struct udf_virtual_data {
struct udf_bitmap {
__u32 s_extPosition;
int s_nr_groups;
- struct buffer_head *s_block_bitmap[];
+ struct buffer_head *s_block_bitmap[] __counted_by(s_nr_groups);
};
struct udf_part_map {
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 2436e3f82147..53c11be2b2c1 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -240,6 +240,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
unsigned int count, sector_t oldb,
sector_t newb, struct page *locked_page)
{
+ struct folio *folio, *locked_folio = page_folio(locked_page);
const unsigned blks_per_page =
1 << (PAGE_SHIFT - inode->i_blkbits);
const unsigned mask = blks_per_page - 1;
@@ -247,42 +248,39 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
pgoff_t index, cur_index, last_index;
unsigned pos, j, lblock;
sector_t end, i;
- struct page *page;
struct buffer_head *head, *bh;
UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n",
inode->i_ino, count,
(unsigned long long)oldb, (unsigned long long)newb);
- BUG_ON(!locked_page);
- BUG_ON(!PageLocked(locked_page));
+ BUG_ON(!folio_test_locked(locked_folio));
- cur_index = locked_page->index;
+ cur_index = locked_folio->index;
end = count + beg;
last_index = end >> (PAGE_SHIFT - inode->i_blkbits);
for (i = beg; i < end; i = (i | mask) + 1) {
index = i >> (PAGE_SHIFT - inode->i_blkbits);
if (likely(cur_index != index)) {
- page = ufs_get_locked_page(mapping, index);
- if (!page)/* it was truncated */
+ folio = ufs_get_locked_folio(mapping, index);
+ if (!folio) /* it was truncated */
continue;
- if (IS_ERR(page)) {/* or EIO */
+ if (IS_ERR(folio)) {/* or EIO */
ufs_error(inode->i_sb, __func__,
"read of page %llu failed\n",
(unsigned long long)index);
continue;
}
} else
- page = locked_page;
+ folio = locked_folio;
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
pos = i & mask;
for (j = 0; j < pos; ++j)
bh = bh->b_this_page;
-
if (unlikely(index == last_index))
lblock = end & mask;
else
@@ -313,7 +311,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
} while (bh != head);
if (likely(cur_index != index))
- ufs_put_locked_page(page);
+ ufs_put_locked_folio(folio);
}
UFSD("EXIT\n");
}
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index fd57f03b6c93..27c85d92d1dc 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
ufs_commit_chunk(page, pos, len);
ufs_put_page(page);
if (update_times)
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ufs_handle_dirsync(dir);
}
@@ -397,7 +397,7 @@ got_it:
ufs_set_de_type(sb, de, inode->i_mode);
ufs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = ufs_handle_dirsync(dir);
@@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
ufs_commit_chunk(page, pos, to - from);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
err = ufs_handle_dirsync(inode);
out:
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a1e7bd9d1f98..73531827ecee 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -292,7 +292,7 @@ cg_found:
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_blocks = 0;
inode->i_generation = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ufsi->i_flags = UFS_I(dir)->i_flags;
ufsi->i_lastfrag = 0;
ufsi->i_shadow = 0;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 21a4779a2de5..ebce93b08281 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -579,13 +579,15 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode));
inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
- inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
+ inode_set_atime(inode,
+ (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec),
+ 0);
inode_set_ctime(inode,
(signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec),
0);
- inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec),
+ 0);
inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen);
ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
@@ -626,12 +628,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid));
inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
- inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
+ inode_set_atime(inode, fs64_to_cpu(sb, ufs2_inode->ui_atime),
+ fs32_to_cpu(sb, ufs2_inode->ui_atimensec));
inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime),
fs32_to_cpu(sb, ufs2_inode->ui_ctimensec));
- inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime);
- inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec);
- inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec);
+ inode_set_mtime(inode, fs64_to_cpu(sb, ufs2_inode->ui_mtime),
+ fs32_to_cpu(sb, ufs2_inode->ui_mtimensec));
inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen);
ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
@@ -725,12 +727,14 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode));
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
- ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
+ ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb,
+ inode_get_atime_sec(inode));
ufs_inode->ui_atime.tv_usec = 0;
ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
ufs_inode->ui_ctime.tv_usec = 0;
- ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec);
+ ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb,
+ inode_get_mtime_sec(inode));
ufs_inode->ui_mtime.tv_usec = 0;
ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks);
ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -770,13 +774,15 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode));
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
- ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
- ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec);
- ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec);
+ ufs_inode->ui_atime = cpu_to_fs64(sb, inode_get_atime_sec(inode));
+ ufs_inode->ui_atimensec = cpu_to_fs32(sb,
+ inode_get_atime_nsec(inode));
+ ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime_sec(inode));
ufs_inode->ui_ctimensec = cpu_to_fs32(sb,
- inode_get_ctime(inode).tv_nsec);
- ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec);
- ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec);
+ inode_get_ctime_nsec(inode));
+ ufs_inode->ui_mtime = cpu_to_fs64(sb, inode_get_mtime_sec(inode));
+ ufs_inode->ui_mtimensec = cpu_to_fs32(sb,
+ inode_get_mtime_nsec(inode));
ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks);
ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -1057,7 +1063,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
unsigned i, end;
sector_t lastfrag;
- struct page *lastpage;
+ struct folio *folio;
struct buffer_head *bh;
u64 phys64;
@@ -1068,18 +1074,17 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
lastfrag--;
- lastpage = ufs_get_locked_page(mapping, lastfrag >>
+ folio = ufs_get_locked_folio(mapping, lastfrag >>
(PAGE_SHIFT - inode->i_blkbits));
- if (IS_ERR(lastpage)) {
- err = -EIO;
- goto out;
- }
-
- end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
- bh = page_buffers(lastpage);
- for (i = 0; i < end; ++i)
- bh = bh->b_this_page;
+ if (IS_ERR(folio)) {
+ err = -EIO;
+ goto out;
+ }
+ end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
+ bh = folio_buffers(folio);
+ for (i = 0; i < end; ++i)
+ bh = bh->b_this_page;
err = ufs_getfrag_block(inode, lastfrag, bh, 1);
@@ -1095,7 +1100,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
*/
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
- set_page_dirty(lastpage);
+ folio_mark_dirty(folio);
}
if (lastfrag >= UFS_IND_FRAGMENT) {
@@ -1113,7 +1118,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
}
}
out_unlock:
- ufs_put_locked_page(lastpage);
+ ufs_put_locked_folio(folio);
out:
return err;
}
@@ -1208,7 +1213,7 @@ static int ufs_truncate(struct inode *inode, loff_t size)
truncate_setsize(inode, size);
ufs_truncate_blocks(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
out:
UFSD("EXIT: err %d\n", err);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 23377c1baed9..a480810cd4e3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -137,6 +137,7 @@ static struct dentry *ufs_get_parent(struct dentry *child)
}
static const struct export_operations ufs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ufs_fh_to_dentry,
.fh_to_parent = ufs_fh_to_parent,
.get_parent = ufs_get_parent,
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 08ddf41eaaad..2acf191eb89e 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -230,42 +230,40 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
}
/**
- * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist
+ * ufs_get_locked_folio() - locate, pin and lock a pagecache folio; if it does not exist,
* read it from disk.
* @mapping: the address_space to search
* @index: the page index
*
- * Locates the desired pagecache page, if not exist we'll read it,
+ * Locates the desired pagecache folio; if it does not exist, reads it,
* locks it, increments its reference
* count and returns its address.
*
*/
-
-struct page *ufs_get_locked_page(struct address_space *mapping,
+struct folio *ufs_get_locked_folio(struct address_space *mapping,
pgoff_t index)
{
struct inode *inode = mapping->host;
- struct page *page = find_lock_page(mapping, index);
- if (!page) {
- page = read_mapping_page(mapping, index, NULL);
+ struct folio *folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ folio = read_mapping_folio(mapping, index, NULL);
- if (IS_ERR(page)) {
- printk(KERN_ERR "ufs_change_blocknr: "
- "read_mapping_page error: ino %lu, index: %lu\n",
+ if (IS_ERR(folio)) {
+ printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n",
mapping->host->i_ino, index);
- return page;
+ return folio;
}
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping == NULL)) {
+ if (unlikely(folio->mapping == NULL)) {
/* Truncate got there first */
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return NULL;
}
}
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits, 0);
- return page;
+ if (!folio_buffers(folio))
+ create_empty_buffers(folio, 1 << inode->i_blkbits, 0);
+ return folio;
}
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 89247193d96d..0ecd2ed792f5 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -273,15 +273,13 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc
extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
/* These functions work with cache pages */
-extern struct page *ufs_get_locked_page(struct address_space *mapping,
- pgoff_t index);
-static inline void ufs_put_locked_page(struct page *page)
+struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index);
+static inline void ufs_put_locked_folio(struct folio *folio)
{
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
-
/*
* macros and inline function to get important structures from ufs_sb_private_info
*/
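
The ufs hunks above convert the locked-page helper to folios without changing the calling convention: look the folio up or read it in, walk its buffer ring, then unlock and drop the reference via ufs_put_locked_folio(). The fragment below is a minimal caller sketch, not part of the patch; ufs_demo_dirty_frag() is a hypothetical name, and it assumes the fs/ufs context (util.h plus the usual buffer-head headers) so that the helpers introduced in this diff are in scope.

/* Illustrative only: dirty the buffer that maps @frag (hypothetical helper). */
static int ufs_demo_dirty_frag(struct inode *inode, unsigned int frag)
{
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	struct buffer_head *bh;
	unsigned int i, end;

	folio = ufs_get_locked_folio(mapping,
			frag >> (PAGE_SHIFT - inode->i_blkbits));
	if (IS_ERR(folio))
		return -EIO;
	if (!folio)		/* raced with truncate, nothing to do */
		return 0;

	/* Walk the buffer ring to the fragment inside this folio. */
	end = frag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
	bh = folio_buffers(folio);
	for (i = 0; i < end; ++i)
		bh = bh->b_this_page;

	mark_buffer_dirty(bh);
	folio_mark_dirty(folio);

	ufs_put_locked_folio(folio);	/* folio_unlock() + folio_put() */
	return 0;
}
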
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 56eaae9dac1a..e8af40b05549 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -49,7 +49,7 @@ static struct ctl_table vm_userfaultfd_table[] = {
};
#endif
-static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
@@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+ return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -922,20 +927,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
continue;
}
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- } else {
- prev = vma;
- }
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
+ vma->vm_end, new_flags,
+ NULL_VM_UFFD_CTX);
vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ prev = vma;
}
mmap_write_unlock(mm);
mmput(mm);
@@ -1325,7 +1325,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool basic_ioctls;
unsigned long start, end, vma_end;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1399,7 +1399,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur, vm_flags))
+ if (!vma_can_userfault(cur, vm_flags, wp_async))
goto out_unlock;
/*
@@ -1460,7 +1460,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1478,28 +1478,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma_end = min(end, vma->vm_end);
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }),
- anon_vma_name(vma));
- if (prev) {
- /* vma_merge() invalidated the mas */
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags,
+ (struct vm_userfaultfd_ctx){ctx});
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
@@ -1561,7 +1547,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1615,7 +1601,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur, cur->vm_flags))
+ if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
goto out_unlock;
found = true;
@@ -1631,7 +1617,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
/*
* Nothing to do: this vma is already registered into this
@@ -1664,26 +1650,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags, NULL_VM_UFFD_CTX);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
@@ -2018,6 +1991,11 @@ out:
return ret;
}
+bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+ return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2051,6 +2029,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
+
+ /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+ if (features & UFFD_FEATURE_WP_ASYNC)
+ features |= UFFD_FEATURE_WP_UNPOPULATED;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@ -2063,6 +2046,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
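
On the userfaultfd side, UFFD_FEATURE_WP_ASYNC is negotiated through the normal UFFDIO_API handshake; as the hunk above shows, the kernel folds in UFFD_FEATURE_WP_UNPOPULATED automatically and hides the bit when CONFIG_PTE_MARKER_UFFD_WP is not built. A hedged userspace sketch of that handshake follows; open_uffd_wp_async() is a made-up helper, it assumes uapi headers new enough to define UFFD_FEATURE_WP_ASYNC, and it trims most error reporting.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int open_uffd_wp_async(void)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_ASYNC,	/* kernel also enables WP_UNPOPULATED */
	};
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0)
		return -1;
	if (ioctl(fd, UFFDIO_API, &api) < 0) {
		close(fd);
		return -1;
	}
	/* On return, api.features echoes what this kernel actually supports. */
	if (!(api.features & UFFD_FEATURE_WP_ASYNC))
		fprintf(stderr, "WP_ASYNC not available on this kernel\n");
	return fd;
}
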
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 83f20dd15522..72ac9320e6a3 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -126,12 +126,12 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
do_div(allocated, 512);
inode->i_blocks = allocated;
- inode->i_atime = ns_to_timespec64(
- info->access_time.ns_relative_to_unix_epoch);
+ inode_set_atime_to_ts(inode,
+ ns_to_timespec64(info->access_time.ns_relative_to_unix_epoch));
inode_set_ctime_to_ts(inode,
ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch));
- inode->i_mtime = ns_to_timespec64(
- info->modification_time.ns_relative_to_unix_epoch);
+ inode_set_mtime_to_ts(inode,
+ ns_to_timespec64(info->modification_time.ns_relative_to_unix_epoch));
return 0;
}
@@ -194,7 +194,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
struct vboxsf_sbi *sbi;
struct vboxsf_inode *sf_i;
struct shfl_fsobjinfo info;
- struct timespec64 prev_mtime;
+ struct timespec64 mtime, prev_mtime;
struct inode *inode;
int err;
@@ -202,7 +202,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
return -EINVAL;
inode = d_inode(dentry);
- prev_mtime = inode->i_mtime;
+ prev_mtime = inode_get_mtime(inode);
sf_i = VBOXSF_I(inode);
sbi = VBOXSF_SBI(dentry->d_sb);
if (!sf_i->force_restat) {
@@ -225,7 +225,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
* page-cache for it. Note this also gets triggered by our own writes,
* this is unavoidable.
*/
- if (timespec64_compare(&inode->i_mtime, &prev_mtime) > 0)
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&mtime, &prev_mtime) > 0)
invalidate_inode_pages2(inode->i_mapping);
return 0;
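
The vboxsf hunks are part of the same tree-wide conversion seen in the ufs and xfs pieces of this diff: direct reads and writes of inode->i_atime/i_mtime become accessor calls so the VFS is free to repack the timestamp fields. A small sketch of the resulting pattern; demo_mtime_changed() is a hypothetical helper, shown only to illustrate the accessors used above, and it assumes the usual in-kernel fs headers.

/* Sketch: detect a server-side mtime change using the accessor API. */
static bool demo_mtime_changed(struct inode *inode, struct timespec64 new_mtime)
{
	struct timespec64 old_mtime = inode_get_mtime(inode);

	inode_set_mtime_to_ts(inode, new_mtime);
	inode_set_ctime_current(inode);

	return timespec64_compare(&new_mtime, &old_mtime) > 0;
}
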
diff --git a/fs/xattr.c b/fs/xattr.c
index efd4736bc94b..09d927603433 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -56,7 +56,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
static const struct xattr_handler *
xattr_resolve_name(struct inode *inode, const char **name)
{
- const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+ const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
const struct xattr_handler *handler;
if (!(inode->i_opflags & IOP_XATTR)) {
@@ -162,7 +162,7 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
int
xattr_supports_user_prefix(struct inode *inode)
{
- const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+ const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
const struct xattr_handler *handler;
if (!(inode->i_opflags & IOP_XATTR)) {
@@ -999,7 +999,7 @@ int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
ssize_t
generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
+ const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
ssize_t remaining_size = buffer_size;
int err = 0;
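
The fs/xattr.c change is purely about const-correctness: super_block->s_xattr now points at an array of const handler pointers, so it has to be traversed through a "const struct xattr_handler * const *" cursor rather than a plain double pointer. The standalone example below (ordinary userspace C, not kernel code, with a toy handler struct) shows why the extra const on the inner pointer matters.

#include <stdio.h>

struct handler { const char *prefix; };

static const struct handler user_handler = { "user." };

/* A const table of const pointers, like a filesystem's s_xattr array. */
static const struct handler *const handlers[] = { &user_handler, NULL };

int main(void)
{
	/* Correct cursor type: neither the pointers nor the table may change. */
	const struct handler *const *h;

	for (h = handlers; *h; h++)
		printf("%s\n", (*h)->prefix);

	/*
	 * "const struct handler **h2 = handlers;" would warn: that type
	 * would permit h2[0] = ..., silently discarding the table's const.
	 */
	return 0;
}
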
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index ed0bc8cbc703..567fb37274d3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select XFS_DEBUG
+ select DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3069194527dd..100ab5931b31 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist(
ASSERT(mp->m_alloc_maxlevels > 0);
+ /*
+ * For a btree shorter than the maximum height, the worst case is that
+ * every level gets split and a new level is added, then while inserting
+ * another entry to refill the AGFL, every level under the old root gets
+ * split again. This is:
+ *
+ * (full height split reservation) + (AGFL refill split height)
+ * = (current height + 1) + (current height - 1)
+ * = (new height) + (new height - 2)
+ * = 2 * new height - 2
+ *
+ * For a btree of maximum height, the worst case is that every level
+ * under the root gets split, then while inserting another entry to
+ * refill the AGFL, every level under the root gets split again. This is
+ * also:
+ *
+ * 2 * (current height - 1)
+ * = 2 * (new height - 1)
+ * = 2 * new height - 2
+ */
+
/* space needed by-bno freespace btree */
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_alloc_maxlevels);
+ mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_alloc_maxlevels);
+ mp->m_alloc_maxlevels) * 2 - 2;
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
+ mp->m_rmap_maxlevels) * 2 - 2;
return min_free;
}
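
The new comment in xfs_alloc_min_freelist() argues that both the growing-tree and max-height cases collapse to the same reservation, 2 * new_height - 2 blocks per free-space btree, which is exactly what the min_t(...) * 2 - 2 expressions compute. A standalone arithmetic check with made-up heights (not kernel code):

#include <assert.h>
#include <stdio.h>

/* Worst-case freelist blocks for one btree, per the comment above. */
static unsigned int min_free_per_btree(unsigned int cur_height,
				       unsigned int max_height)
{
	/* Growing tree: full split plus new root, then AGFL refill splits. */
	if (cur_height < max_height)
		return (cur_height + 1) + (cur_height - 1);
	/* Tree already at max height: root cannot split, only levels below. */
	return 2 * (cur_height - 1);
}

int main(void)
{
	/* Example: current height 3, max height 5 -> new height 4. */
	assert(min_free_per_btree(3, 5) == 2 * 4 - 2);	/* 6 */
	/* Example: already at max height 5 -> new height stays 5. */
	assert(min_free_per_btree(5, 5) == 2 * 5 - 2);	/* 8 */
	printf("both cases equal 2 * new_height - 2\n");
	return 0;
}
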
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 30c931b38853..be62acffad6c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -21,7 +21,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
@@ -2989,7 +2989,7 @@ xfs_bmap_extsize_align(
* If realtime, and the result isn't a multiple of the realtime
* extent size we need to remove blocks until it is.
*/
- if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+ if (rt && (temp = xfs_extlen_to_rtxmod(mp, align_alen))) {
/*
* We're not covering the original request, or
* we won't be able to once we fix the length.
@@ -3016,7 +3016,7 @@ xfs_bmap_extsize_align(
else {
align_alen -= orig_off - align_off;
align_off = orig_off;
- align_alen -= align_alen % mp->m_sb.sb_rextsize;
+ align_alen -= xfs_extlen_to_rtxmod(mp, align_alen);
}
/*
* Result doesn't cover the request, fail it.
@@ -4826,12 +4826,8 @@ xfs_bmap_del_extent_delay(
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
- if (isrt) {
- uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
-
- do_div(rtexts, mp->m_sb.sb_rextsize);
- xfs_mod_frextents(mp, rtexts);
- }
+ if (isrt)
+ xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
/*
* Update the inode delalloc counter now and wait to update the
@@ -5057,33 +5053,20 @@ xfs_bmap_del_extent_real(
flags = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_filblks_t len;
- xfs_extlen_t mod;
-
- len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
- &mod);
- ASSERT(mod == 0);
-
if (!(bflags & XFS_BMAPI_REMAP)) {
- xfs_fsblock_t bno;
-
- bno = div_u64_rem(del->br_startblock,
- mp->m_sb.sb_rextsize, &mod);
- ASSERT(mod == 0);
-
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ error = xfs_rtfree_blocks(tp, del->br_startblock,
+ del->br_blockcount);
if (error)
goto done;
}
do_fx = 0;
- nblks = len * mp->m_sb.sb_rextsize;
qfield = XFS_TRANS_DQ_RTBCOUNT;
} else {
do_fx = 1;
- nblks = del->br_blockcount;
qfield = XFS_TRANS_DQ_BCOUNT;
}
+ nblks = del->br_blockcount;
del_endblock = del->br_startblock + del->br_blockcount;
if (cur) {
@@ -5289,7 +5272,6 @@ __xfs_bunmapi(
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
- xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
@@ -5384,8 +5366,8 @@ __xfs_bunmapi(
if (!isrt)
goto delete;
- sum = del.br_startblock + del.br_blockcount;
- div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod);
+ mod = xfs_rtb_to_rtxoff(mp,
+ del.br_startblock + del.br_blockcount);
if (mod) {
/*
* Realtime extent not lined up at the end.
@@ -5432,7 +5414,8 @@ __xfs_bunmapi(
goto error0;
goto nodelete;
}
- div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod);
+
+ mod = xfs_rtb_to_rtxoff(mp, del.br_startblock);
if (mod) {
xfs_extlen_t off = mp->m_sb.sb_rextsize - mod;
@@ -6209,8 +6192,8 @@ xfs_bmap_validate_extent(
return __this_address;
if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
- if (!xfs_verify_rtext(mp, irec->br_startblock,
- irec->br_blockcount))
+ if (!xfs_verify_rtbext(mp, irec->br_startblock,
+ irec->br_blockcount))
return __this_address;
} else {
if (!xfs_verify_fsbext(mp, irec->br_startblock,
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index bcfb6a4203cd..f71679ce23b9 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -245,21 +245,18 @@ xfs_defer_create_intents(
return ret;
}
-/* Abort all the intents that were committed. */
STATIC void
-xfs_defer_trans_abort(
- struct xfs_trans *tp,
- struct list_head *dop_pending)
+xfs_defer_pending_abort(
+ struct xfs_mount *mp,
+ struct list_head *dop_list)
{
struct xfs_defer_pending *dfp;
const struct xfs_defer_op_type *ops;
- trace_xfs_defer_trans_abort(tp, _RET_IP_);
-
/* Abort intent items that don't have a done item. */
- list_for_each_entry(dfp, dop_pending, dfp_list) {
+ list_for_each_entry(dfp, dop_list, dfp_list) {
ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+ trace_xfs_defer_pending_abort(mp, dfp);
if (dfp->dfp_intent && !dfp->dfp_done) {
ops->abort_intent(dfp->dfp_intent);
dfp->dfp_intent = NULL;
@@ -267,6 +264,16 @@ xfs_defer_trans_abort(
}
}
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+ struct xfs_trans *tp,
+ struct list_head *dop_pending)
+{
+ trace_xfs_defer_trans_abort(tp, _RET_IP_);
+ xfs_defer_pending_abort(tp->t_mountp, dop_pending);
+}
+
/*
* Capture resources that the caller said not to release ("held") when the
* transaction commits. Caller is responsible for zero-initializing @dres.
@@ -756,12 +763,13 @@ xfs_defer_ops_capture(
/* Release all resources that we used to capture deferred ops. */
void
-xfs_defer_ops_capture_free(
+xfs_defer_ops_capture_abort(
struct xfs_mount *mp,
struct xfs_defer_capture *dfc)
{
unsigned short i;
+ xfs_defer_pending_abort(mp, &dfc->dfc_dfops);
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
@@ -802,7 +810,7 @@ xfs_defer_ops_capture_and_commit(
/* Commit the transaction and add the capture structure to the list. */
error = xfs_trans_commit(tp);
if (error) {
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 114a3a4930a3..8788ad5f6a73 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
struct xfs_defer_resources *dres);
-void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 371dc07233e0..9a88aba1589f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -98,7 +98,7 @@ typedef struct xfs_sb {
uint32_t sb_blocksize; /* logical block size, bytes */
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
- xfs_rtblock_t sb_rextents; /* number of realtime extents */
+ xfs_rtbxlen_t sb_rextents; /* number of realtime extents */
uuid_t sb_uuid; /* user-visible file system unique id */
xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
@@ -691,6 +691,22 @@ struct xfs_agfl {
xfs_daddr_to_agno(mp, (d) + (len) - 1)))
/*
+ * Realtime bitmap information is accessed by the word, which is currently
+ * stored in host-endian format.
+ */
+union xfs_rtword_raw {
+ __u32 old;
+};
+
+/*
+ * Realtime summary counts are accessed by the word, which is currently
+ * stored in host-endian format.
+ */
+union xfs_suminfo_raw {
+ __u32 old;
+};
+
+/*
* XFS Timestamps
* ==============
*
@@ -1142,24 +1158,10 @@ static inline bool xfs_dinode_has_large_extent_counts(
#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
-#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
-#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
/*
- * RT Summary and bit manipulation macros.
+ * RT bit manipulation macros.
*/
-#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define XFS_SUMOFFSTOBLOCK(mp,s) \
- (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_SUMPTR(mp,bp,so) \
- ((xfs_suminfo_t *)((bp)->b_addr + \
- (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
-#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
-#define XFS_BITTOWORD(mp,bi) \
- ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index a35781577cad..137a65bda95d 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -220,8 +220,10 @@ xfs_inode_from_disk(
* a time before epoch is converted to a time long after epoch
* on 64 bit systems.
*/
- inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
- inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode_set_atime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_atime));
+ inode_set_mtime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_mtime));
inode_set_ctime_to_ts(inode,
xfs_inode_from_disk_ts(from, from->di_ctime));
@@ -315,8 +317,8 @@ xfs_inode_to_disk(
to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
- to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
- to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode));
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode));
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
@@ -508,6 +510,9 @@ xfs_dinode_verify(
if (mode && nextents + naextents > nblocks)
return __this_address;
+ if (nextents + naextents == 0 && nblocks != 0)
+ return __this_address;
+
if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index fa180ab66b73..c269d704314d 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -16,6 +16,7 @@
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -46,25 +47,69 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
.verify_write = xfs_rtbuf_verify_write,
};
+/* Release cached rt bitmap and summary buffers. */
+void
+xfs_rtbuf_cache_relse(
+ struct xfs_rtalloc_args *args)
+{
+ if (args->rbmbp) {
+ xfs_trans_brelse(args->tp, args->rbmbp);
+ args->rbmbp = NULL;
+ args->rbmoff = NULLFILEOFF;
+ }
+ if (args->sumbp) {
+ xfs_trans_brelse(args->tp, args->sumbp);
+ args->sumbp = NULL;
+ args->sumoff = NULLFILEOFF;
+ }
+}
+
/*
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
int
xfs_rtbuf_get(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t block, /* block number in bitmap or summary */
- int issum, /* is summary not bitmap */
- struct xfs_buf **bpp) /* output: buffer for the block */
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block, /* block number in bitmap or summary */
+ int issum) /* is summary not bitmap */
{
- struct xfs_buf *bp; /* block buffer, result */
- xfs_inode_t *ip; /* bitmap or summary inode */
- xfs_bmbt_irec_t map;
- int nmap = 1;
- int error; /* error value */
+ struct xfs_mount *mp = args->mp;
+ struct xfs_buf **cbpp; /* cached block buffer */
+ xfs_fileoff_t *coffp; /* cached block number */
+ struct xfs_buf *bp; /* block buffer, result */
+ struct xfs_inode *ip; /* bitmap or summary inode */
+ struct xfs_bmbt_irec map;
+ enum xfs_blft type;
+ int nmap = 1;
+ int error;
- ip = issum ? mp->m_rsumip : mp->m_rbmip;
+ if (issum) {
+ cbpp = &args->sumbp;
+ coffp = &args->sumoff;
+ ip = mp->m_rsumip;
+ type = XFS_BLFT_RTSUMMARY_BUF;
+ } else {
+ cbpp = &args->rbmbp;
+ coffp = &args->rbmoff;
+ ip = mp->m_rbmip;
+ type = XFS_BLFT_RTBITMAP_BUF;
+ }
+
+ /*
+ * If we have a cached buffer, and the block number matches, use that.
+ */
+ if (*cbpp && *coffp == block)
+ return 0;
+
+ /*
+ * Otherwise we have to get the buffer. If there was an old
+ * one, get rid of it first.
+ */
+ if (*cbpp) {
+ xfs_trans_brelse(args->tp, *cbpp);
+ *cbpp = NULL;
+ }
error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0);
if (error)
@@ -74,15 +119,15 @@ xfs_rtbuf_get(
return -EFSCORRUPTED;
ASSERT(map.br_startblock != NULLFSBLOCK);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
if (error)
return error;
- xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
- : XFS_BLFT_RTBITMAP_BUF);
- *bpp = bp;
+ xfs_trans_buf_set_type(args->tp, bp, type);
+ *cbpp = bp;
+ *coffp = block;
return 0;
}
@@ -92,47 +137,44 @@ xfs_rtbuf_get(
*/
int
xfs_rtfind_back(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to look at */
+ xfs_rtxnum_t limit, /* last rtext to look at */
+ xfs_rtxnum_t *rtx) /* out: start rtext found */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t firstbit; /* first useful bit in the word */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error; /* error value */
+ xfs_rtxnum_t firstbit; /* first useful bit in the word */
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute and read in starting bitmap block for starting block.
*/
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ block = xfs_rtx_to_rbmblock(mp, start);
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Get the first word's index & point to it.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
len = start - limit + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
*/
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ incore = xfs_rtbitmap_getword(args, word);
+ want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
/*
* If the starting position is not word-aligned, deal with the
* partial word.
@@ -149,13 +191,12 @@ xfs_rtfind_back(
* Calculate the difference between the value there
* and what we're looking for.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different. Mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i = bit - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
i = bit - firstbit + 1;
@@ -167,19 +208,11 @@ xfs_rtfind_back(
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, --block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
+
+ word = mp->m_blockwsize - 1;
}
} else {
/*
@@ -195,13 +228,13 @@ xfs_rtfind_back(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ want)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ want)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
i += XFS_NBWORD;
@@ -213,19 +246,11 @@ xfs_rtfind_back(
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, --block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
+
+ word = mp->m_blockwsize - 1;
}
}
/*
@@ -242,13 +267,13 @@ xfs_rtfind_back(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
} else
i = len;
@@ -256,8 +281,7 @@ xfs_rtfind_back(
/*
* No match, return that we scanned the whole area.
*/
- xfs_trans_brelse(tp, bp);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
@@ -267,47 +291,44 @@ xfs_rtfind_back(
*/
int
xfs_rtfind_forw(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to look at */
+ xfs_rtxnum_t limit, /* last rtext to look at */
+ xfs_rtxnum_t *rtx) /* out: start rtext found */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in the word */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t lastbit;/* last useful bit in the word */
+ xfs_rtxnum_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute and read in starting bitmap block for starting block.
*/
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ block = xfs_rtx_to_rbmblock(mp, start);
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Get the first word's index & point to it.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
len = limit - start + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
*/
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ incore = xfs_rtbitmap_getword(args, word);
+ want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
/*
* If the starting position is not word-aligned, deal with the
* partial word.
@@ -323,13 +344,12 @@ xfs_rtfind_forw(
* Calculate the difference between the value there
* and what we're looking for.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different. Mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i = XFS_RTLOBIT(wdiff) - bit;
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
i = lastbit - bit;
@@ -337,22 +357,15 @@ xfs_rtfind_forw(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b++;
}
} else {
/*
@@ -368,13 +381,13 @@ xfs_rtfind_forw(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ want)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ want)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
i += XFS_NBWORD;
@@ -382,22 +395,15 @@ xfs_rtfind_forw(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
}
/*
@@ -412,13 +418,13 @@ xfs_rtfind_forw(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
} else
i = len;
@@ -426,11 +432,25 @@ xfs_rtfind_forw(
/*
* No match, return that we scanned the whole area.
*/
- xfs_trans_brelse(tp, bp);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
+/* Log rtsummary counter at @infoword. */
+static inline void
+xfs_trans_log_rtsummary(
+ struct xfs_rtalloc_args *args,
+ unsigned int infoword)
+{
+ struct xfs_buf *bp = args->sumbp;
+ size_t first, last;
+
+ first = (void *)xfs_rsumblock_infoptr(args, infoword) - bp->b_addr;
+ last = first + sizeof(xfs_suminfo_t) - 1;
+
+ xfs_trans_log_buf(args->tp, bp, first, last);
+}
+
/*
* Read and/or modify the summary information for a given extent size,
* bitmap block combination.
@@ -442,86 +462,77 @@ xfs_rtfind_forw(
*/
int
xfs_rtmodify_summary_int(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- struct xfs_buf *bp; /* buffer for the summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
+ struct xfs_mount *mp = args->mp;
+ int error;
+ xfs_fileoff_t sb; /* summary fsblock */
+ xfs_rtsumoff_t so; /* index into the summary file */
+ unsigned int infoword;
/*
* Compute entry number in the summary file.
*/
- so = XFS_SUMOFFS(mp, log, bbno);
+ so = xfs_rtsumoffs(mp, log, bbno);
/*
* Compute the block number in the summary file.
*/
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (*rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (*rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- *rbpp = bp;
- *rsb = sb;
- }
+ sb = xfs_rtsumoffs_to_block(mp, so);
+
+ error = xfs_rtsummary_read_buf(args, sb);
+ if (error)
+ return error;
+
/*
* Point to the summary information, modify/log it, and/or copy it out.
*/
- sp = XFS_SUMPTR(mp, bp, so);
+ infoword = xfs_rtsumoffs_to_infoword(mp, so);
if (delta) {
- uint first = (uint)((char *)sp - (char *)bp->b_addr);
+ xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta);
- *sp += delta;
if (mp->m_rsum_cache) {
- if (*sp == 0 && log == mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno]++;
- if (*sp != 0 && log < mp->m_rsum_cache[bbno])
+ if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
mp->m_rsum_cache[bbno] = log;
+ if (val != 0 && log >= mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
}
- xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+ xfs_trans_log_rtsummary(args, infoword);
+ if (sum)
+ *sum = val;
+ } else if (sum) {
+ *sum = xfs_suminfo_get(args, infoword);
}
- if (sum)
- *sum = *sp;
return 0;
}
int
xfs_rtmodify_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int delta) /* change to make to summary info */
{
- return xfs_rtmodify_summary_int(mp, tp, log, bbno,
- delta, rbpp, rsb, NULL);
+ return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL);
+}
+
+/* Log rtbitmap block from the word @from to the byte before @next. */
+static inline void
+xfs_trans_log_rtbitmap(
+ struct xfs_rtalloc_args *args,
+ unsigned int from,
+ unsigned int next)
+{
+ struct xfs_buf *bp = args->rbmbp;
+ size_t first, last;
+
+ first = (void *)xfs_rbmblock_wordptr(args, from) - bp->b_addr;
+ last = ((void *)xfs_rbmblock_wordptr(args, next) - 1) - bp->b_addr;
+
+ xfs_trans_log_buf(args->tp, bp, first, last);
}
/*
@@ -530,41 +541,37 @@ xfs_rtmodify_summary(
*/
int
xfs_rtmodify_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to modify */
- xfs_extlen_t len, /* length of extent to modify */
- int val) /* 1 for free, 0 for allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to modify */
+ xfs_rtxlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtword_t *first; /* first used word in the buffer */
- int i; /* current bit number rel. to start */
- int lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask o frelevant bits for value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t incore;
+ unsigned int firstword; /* first word used in the buffer */
+ unsigned int word; /* word number in the buffer */
/*
* Compute starting bitmap block number.
*/
- block = XFS_BITTOBLOCK(mp, start);
+ block = xfs_rtx_to_rbmblock(mp, start);
/*
* Read the bitmap block, and point to its data.
*/
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Compute the starting word's address, and starting bit.
*/
- word = XFS_BITTOWORD(mp, start);
- first = b = &bufp[word];
+ firstword = word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
/*
* 0 (allocated) => all zeroes; 1 (free) => all ones.
@@ -583,34 +590,28 @@ xfs_rtmodify_range(
/*
* Set/clear the active bits.
*/
+ incore = xfs_rtbitmap_getword(args, word);
if (val)
- *b |= mask;
+ incore |= mask;
else
- *b &= ~mask;
+ incore &= ~mask;
+ xfs_rtbitmap_setword(args, word, incore);
i = lastbit - bit;
/*
* Go on to the next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* Log the changed part of this block.
* Get the next one.
*/
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ xfs_trans_log_rtbitmap(args, firstword, word);
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
+
+ firstword = word = 0;
}
} else {
/*
@@ -626,31 +627,23 @@ xfs_rtmodify_range(
/*
* Set the word value correctly.
*/
- *b = val;
+ xfs_rtbitmap_setword(args, word, val);
i += XFS_NBWORD;
/*
* Go on to the next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* Log the changed part of this block.
* Get the next one.
*/
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ xfs_trans_log_rtbitmap(args, firstword, word);
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
+
+ firstword = word = 0;
}
}
/*
@@ -665,18 +658,19 @@ xfs_rtmodify_range(
/*
* Set/clear the active bits.
*/
+ incore = xfs_rtbitmap_getword(args, word);
if (val)
- *b |= mask;
+ incore |= mask;
else
- *b &= ~mask;
- b++;
+ incore &= ~mask;
+ xfs_rtbitmap_setword(args, word, incore);
+ word++;
}
/*
* Log any remaining changed bytes.
*/
- if (b > first)
- xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp - 1));
+ if (word > firstword)
+ xfs_trans_log_rtbitmap(args, firstword, word);
return 0;
}
@@ -686,23 +680,21 @@ xfs_rtmodify_range(
*/
int
xfs_rtfree_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to free */
- xfs_extlen_t len, /* length to free */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to free */
+ xfs_rtxlen_t len) /* length to free */
{
- xfs_rtblock_t end; /* end of the freed extent */
- int error; /* error value */
- xfs_rtblock_t postblock; /* first block freed > end */
- xfs_rtblock_t preblock; /* first block freed < start */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtxnum_t postblock; /* first rtext freed > end */
+ xfs_rtxnum_t preblock; /* first rtext freed < start */
end = start + len - 1;
/*
* Modify the bitmap to mark this extent freed.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ error = xfs_rtmodify_range(args, start, len, 1);
if (error) {
return error;
}
@@ -711,15 +703,15 @@ xfs_rtfree_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, 0, &preblock);
if (error) {
return error;
}
/*
* Find the next allocated block (end of allocated extent).
*/
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
+ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
if (error)
return error;
/*
@@ -727,9 +719,9 @@ xfs_rtfree_range(
* old extent, add summary data for them to be allocated.
*/
if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(start - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
}
@@ -739,9 +731,9 @@ xfs_rtfree_range(
* old extent, add summary data for them to be allocated.
*/
if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock - end),
+ xfs_rtx_to_rbmblock(mp, end + 1), -1);
if (error) {
return error;
}
@@ -750,10 +742,9 @@ xfs_rtfree_range(
* Increment the summary information corresponding to the entire
* (new) free extent.
*/
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
- return error;
+ return xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), 1);
}
/*
@@ -762,43 +753,39 @@ xfs_rtfree_range(
*/
int
xfs_rtcheck_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- int val, /* 1 for free, 0 for allocated */
- xfs_rtblock_t *new, /* out: first block not matching */
- int *stat) /* out: 1 for matches, 0 for not */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number of extent */
+ xfs_rtxlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtxnum_t *new, /* out: first rtext not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute starting bitmap block number
*/
- block = XFS_BITTOBLOCK(mp, start);
+ block = xfs_rtx_to_rbmblock(mp, start);
/*
* Read the bitmap block.
*/
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Compute the starting word's address, and starting bit.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
/*
* 0 (allocated) => all zeroes; 1 (free) => all ones.
@@ -820,11 +807,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ val) & mask)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i = XFS_RTLOBIT(wdiff) - bit;
*new = start + i;
*stat = 0;
@@ -835,22 +822,15 @@ xfs_rtcheck_range(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
} else {
/*
@@ -866,11 +846,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ val)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ val)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
*new = start + i;
*stat = 0;
@@ -881,22 +861,15 @@ xfs_rtcheck_range(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
}
/*
@@ -911,11 +884,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ val) & mask)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
*new = start + i;
*stat = 0;
@@ -926,7 +899,6 @@ xfs_rtcheck_range(
/*
* Successful, return.
*/
- xfs_trans_brelse(tp, bp);
*new = start + i;
*stat = 1;
return 0;
@@ -936,57 +908,57 @@ xfs_rtcheck_range(
/*
* Check that the given extent (block range) is allocated already.
*/
-STATIC int /* error */
+STATIC int
xfs_rtcheck_alloc_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number of extent */
- xfs_extlen_t len) /* length of extent */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number of extent */
+ xfs_rtxlen_t len) /* length of extent */
{
- xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
- int stat;
- int error;
+ xfs_rtxnum_t new; /* dummy for xfs_rtcheck_range */
+ int stat;
+ int error;
- error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+ error = xfs_rtcheck_range(args, start, len, 0, &new, &stat);
if (error)
return error;
ASSERT(stat);
return 0;
}
#else
-#define xfs_rtcheck_alloc_range(m,t,b,l) (0)
+#define xfs_rtcheck_alloc_range(a,b,l) (0)
#endif
/*
* Free an extent in the realtime subvolume. Length is expressed in
* realtime extents, as is the block number.
*/
-int /* error */
+int
xfs_rtfree_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len) /* length of extent freed */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtxnum_t start, /* starting rtext number to free */
+ xfs_rtxlen_t len) /* length of extent freed */
{
- int error; /* error value */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_fsblock_t sb; /* summary file block number */
- struct xfs_buf *sumbp = NULL; /* summary file block buffer */
-
- mp = tp->t_mountp;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
+ int error;
+ struct timespec64 atime;
ASSERT(mp->m_rbmip->i_itemp != NULL);
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+ error = xfs_rtcheck_alloc_range(&args, start, len);
if (error)
return error;
/*
* Free the range of realtime blocks.
*/
- error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
- if (error) {
- return error;
- }
+ error = xfs_rtfree_range(&args, start, len);
+ if (error)
+ goto out;
+
/*
* Mark more blocks free in the superblock.
*/
@@ -999,10 +971,49 @@ xfs_rtfree_extent(
mp->m_sb.sb_rextents) {
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
- *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
+
+ atime = inode_get_atime(VFS_I(mp->m_rbmip));
+ atime.tv_sec = 0;
+ inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
- return 0;
+ error = 0;
+out:
+ xfs_rtbuf_cache_relse(&args);
+ return error;
+}
+
+/*
+ * Free some blocks in the realtime subvolume. rtbno and rtlen are in units of
+ * rt blocks, not rt extents; must be aligned to the rt extent size; and rtlen
+ * cannot exceed XFS_MAX_BMBT_EXTLEN.
+ */
+int
+xfs_rtfree_blocks(
+ struct xfs_trans *tp,
+ xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rtxnum_t start;
+ xfs_filblks_t len;
+ xfs_extlen_t mod;
+
+ ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
+
+ len = xfs_rtb_to_rtxrem(mp, rtlen, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EIO;
+ }
+
+ start = xfs_rtb_to_rtxrem(mp, rtbno, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EIO;
+ }
+
+ return xfs_rtfree_extent(tp, start, len);
}
/* Find all the free records within a given range. */
@@ -1015,10 +1026,14 @@ xfs_rtalloc_query_range(
xfs_rtalloc_query_range_fn fn,
void *priv)
{
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
struct xfs_rtalloc_rec rec;
- xfs_rtblock_t rtstart;
- xfs_rtblock_t rtend;
- xfs_rtblock_t high_key;
+ xfs_rtxnum_t rtstart;
+ xfs_rtxnum_t rtend;
+ xfs_rtxnum_t high_key;
int is_free;
int error = 0;
@@ -1034,13 +1049,13 @@ xfs_rtalloc_query_range(
rtstart = low_rec->ar_startext;
while (rtstart <= high_key) {
/* Is the first block free? */
- error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend,
&is_free);
if (error)
break;
/* How long does the extent go for? */
- error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend);
+ error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend);
if (error)
break;
@@ -1056,6 +1071,7 @@ xfs_rtalloc_query_range(
rtstart = rtend + 1;
}
+ xfs_rtbuf_cache_relse(&args);
return error;
}
@@ -1081,18 +1097,79 @@ int
xfs_rtalloc_extent_is_free(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_rtblock_t start,
- xfs_extlen_t len,
+ xfs_rtxnum_t start,
+ xfs_rtxlen_t len,
bool *is_free)
{
- xfs_rtblock_t end;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
+ xfs_rtxnum_t end;
int matches;
int error;
- error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches);
+ error = xfs_rtcheck_range(&args, start, len, 1, &end, &matches);
+ xfs_rtbuf_cache_relse(&args);
if (error)
return error;
*is_free = matches;
return 0;
}
+
+/*
+ * Compute the number of rtbitmap blocks needed to track the given number of rt
+ * extents.
+ */
+xfs_filblks_t
+xfs_rtbitmap_blockcount(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents)
+{
+ return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Compute the number of rtbitmap words needed to populate every block of a
+ * bitmap that is large enough to track the given number of rt extents.
+ */
+unsigned long long
+xfs_rtbitmap_wordcount(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents)
+{
+ xfs_filblks_t blocks;
+
+ blocks = xfs_rtbitmap_blockcount(mp, rtextents);
+ return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+}
+
+/* Compute the number of rtsummary blocks needed to track the given rt space. */
+xfs_filblks_t
+xfs_rtsummary_blockcount(
+ struct xfs_mount *mp,
+ unsigned int rsumlevels,
+ xfs_extlen_t rbmblocks)
+{
+ unsigned long long rsumwords;
+
+ rsumwords = (unsigned long long)rsumlevels * rbmblocks;
+ return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
+}
+
+/*
+ * Compute the number of rtsummary info words needed to populate every block of
+ * a summary file that is large enough to track the given rt space.
+ */
+unsigned long long
+xfs_rtsummary_wordcount(
+ struct xfs_mount *mp,
+ unsigned int rsumlevels,
+ xfs_extlen_t rbmblocks)
+{
+ xfs_filblks_t blocks;
+
+ blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
+ return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+}
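The four sizing helpers above are plain arithmetic, so a standalone worked example may make them easier to check. The geometry below (4096-byte blocks, one million rt extents, 21 summary levels) is purely hypothetical, and the userspace stand-ins only mirror howmany_64(), XFS_FSB_TO_B() and XFS_B_TO_FSB():

/*
 * Worked example, not part of the patch: redo the rtbitmap/rtsummary
 * sizing math for a made-up geometry.  NBBY is 8 and XFS_WORDLOG is 2,
 * matching the definitions used by the helpers above.
 */
#include <stdio.h>
#include <stdint.h>

#define NBBY		8
#define XFS_WORDLOG	2

static uint64_t howmany64(uint64_t x, uint32_t y) { return (x + y - 1) / y; }

int main(void)
{
	uint32_t blocksize = 4096;	/* hypothetical sb_blocksize */
	uint64_t rextents = 1000000;	/* hypothetical sb_rextents */
	unsigned int rsumlevels = 21;	/* hypothetical m_rsumlevels */

	/* xfs_rtbitmap_blockcount(): one bit of bitmap per rt extent */
	uint64_t rbmblocks = howmany64(rextents, NBBY * blocksize);

	/* xfs_rtbitmap_wordcount(): those blocks expressed in 4-byte words */
	uint64_t rbmwords = (rbmblocks * blocksize) >> XFS_WORDLOG;

	/* xfs_rtsummary_blockcount(): one word per (level, bitmap block) */
	uint64_t rsumwords = (uint64_t)rsumlevels * rbmblocks;
	uint64_t rsumblocks = howmany64(rsumwords << XFS_WORDLOG, blocksize);

	/* prints: 31 bitmap blocks, 31744 words; 651 suminfo entries, 1 block */
	printf("%llu bitmap blocks, %llu bitmap words\n",
	       (unsigned long long)rbmblocks, (unsigned long long)rbmwords);
	printf("%llu suminfo entries, %llu summary blocks\n",
	       (unsigned long long)rsumwords, (unsigned long long)rsumblocks);
	return 0;
}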
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
new file mode 100644
index 000000000000..c0637057d69c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_RTBITMAP_H__
+#define __XFS_RTBITMAP_H__
+
+struct xfs_rtalloc_args {
+ struct xfs_mount *mp;
+ struct xfs_trans *tp;
+
+ struct xfs_buf *rbmbp; /* bitmap block buffer */
+ struct xfs_buf *sumbp; /* summary block buffer */
+
+ xfs_fileoff_t rbmoff; /* bitmap block number */
+ xfs_fileoff_t sumoff; /* summary block number */
+};
+
+static inline xfs_rtblock_t
+xfs_rtx_to_rtb(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ if (mp->m_rtxblklog >= 0)
+ return rtx << mp->m_rtxblklog;
+
+ return rtx * mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_extlen_t
+xfs_rtxlen_to_extlen(
+ struct xfs_mount *mp,
+ xfs_rtxlen_t rtxlen)
+{
+ if (mp->m_rtxblklog >= 0)
+ return rtxlen << mp->m_rtxblklog;
+
+ return rtxlen * mp->m_sb.sb_rextsize;
+}
+
+/* Compute the misalignment between an extent length and a realtime extent. */
+static inline unsigned int
+xfs_extlen_to_rtxmod(
+ struct xfs_mount *mp,
+ xfs_extlen_t len)
+{
+ if (mp->m_rtxblklog >= 0)
+ return len & mp->m_rtxblkmask;
+
+ return len % mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_rtxlen_t
+xfs_extlen_to_rtxlen(
+ struct xfs_mount *mp,
+ xfs_extlen_t len)
+{
+ if (mp->m_rtxblklog >= 0)
+ return len >> mp->m_rtxblklog;
+
+ return len / mp->m_sb.sb_rextsize;
+}
+
+/* Convert an rt block number into an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtbno >> mp->m_rtxblklog;
+
+ return div_u64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Return the offset of an rt block number within an rt extent. */
+static inline xfs_extlen_t
+xfs_rtb_to_rtxoff(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtbno & mp->m_rtxblkmask;
+
+ return do_div(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/*
+ * Crack an rt block number into an rt extent number and an offset within that
+ * rt extent. Returns the rt extent number directly and the offset in @off.
+ */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtxrem(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno,
+ xfs_extlen_t *off)
+{
+ if (likely(mp->m_rtxblklog >= 0)) {
+ *off = rtbno & mp->m_rtxblkmask;
+ return rtbno >> mp->m_rtxblklog;
+ }
+
+ return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off);
+}
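A quick hypothetical illustration of the two code paths in the conversion helpers above: with sb_rextsize = 16 the mount code later in this series sets m_rtxblklog = 4 and m_rtxblkmask = 0xf, so shift and mask apply; with sb_rextsize = 12 the log is -1 and the div_u64_rem() fallback is used. The standalone rewrite below (not part of the patch) just demonstrates that both paths agree:

/*
 * Worked example, not part of the patch: crack an rt block number into
 * (rt extent, offset) the way xfs_rtb_to_rtxrem() does, for one
 * power-of-two and one non-power-of-two rt extent size.
 */
#include <stdio.h>
#include <stdint.h>

static void crack(uint64_t rtbno, uint32_t rextsize, int rtxblklog,
		  uint64_t rtxblkmask)
{
	uint64_t rtx, off;

	if (rtxblklog >= 0) {		/* shift/mask fast path */
		rtx = rtbno >> rtxblklog;
		off = rtbno & rtxblkmask;
	} else {			/* div_u64_rem() fallback */
		rtx = rtbno / rextsize;
		off = rtbno % rextsize;
	}
	printf("rtbno %llu, rextsize %u -> rtx %llu, off %llu\n",
	       (unsigned long long)rtbno, rextsize,
	       (unsigned long long)rtx, (unsigned long long)off);
}

int main(void)
{
	crack(1000003, 16, 4, 0xf);	/* rtx 62500, off 3 */
	crack(1000003, 12, -1, 0);	/* rtx 83333, off 7 */
	return 0;
}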
+
+/*
+ * Convert an rt block number into an rt extent number, rounding up to the next
+ * rt extent if the rt block is not aligned to an rt extent boundary.
+ */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtxup(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0)) {
+ if (rtbno & mp->m_rtxblkmask)
+ return (rtbno >> mp->m_rtxblklog) + 1;
+ return rtbno >> mp->m_rtxblklog;
+ }
+
+ if (do_div(rtbno, mp->m_sb.sb_rextsize))
+ rtbno++;
+ return rtbno;
+}
+
+/* Round this rtblock up to the nearest rt extent size. */
+static inline xfs_rtblock_t
+xfs_rtb_roundup_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return roundup_64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Round this rtblock down to the nearest rt extent size. */
+static inline xfs_rtblock_t
+xfs_rtb_rounddown_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return rounddown_64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Convert an rt extent number to a file block offset in the rt bitmap file. */
+static inline xfs_fileoff_t
+xfs_rtx_to_rbmblock(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return rtx >> mp->m_blkbit_log;
+}
+
+/* Convert an rt extent number to a word offset within an rt bitmap block. */
+static inline unsigned int
+xfs_rtx_to_rbmword(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1);
+}
+
+/* Convert a file block offset in the rt bitmap file to an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rbmblock_to_rtx(
+ struct xfs_mount *mp,
+ xfs_fileoff_t rbmoff)
+{
+ return rbmoff << mp->m_blkbit_log;
+}
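To make the bit addressing above concrete, take a hypothetical 4096-byte block size, so m_blkbit_log is 15 (32768 bits per block) and m_blockwsize is 1024 words per block. For rt extent number 1000003 (numbers illustrative only):

	xfs_rtx_to_rbmblock()  = 1000003 >> 15         = 30
	xfs_rtx_to_rbmword()   = (1000003 >> 5) & 1023 = 530
	bit within that word   = 1000003 & 31          = 3

so the allocation state of extent 1000003 lives in bit 3 of word 530 of rtbitmap block 30.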
+
+/* Return a pointer to a bitmap word within an rt bitmap block. */
+static inline union xfs_rtword_raw *
+xfs_rbmblock_wordptr(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_rtword_raw *words = args->rbmbp->b_addr;
+
+ return words + index;
+}
+
+/* Convert an ondisk bitmap word to its incore representation. */
+static inline xfs_rtword_t
+xfs_rtbitmap_getword(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+
+ return word->old;
+}
+
+/* Set an ondisk bitmap word from an incore representation. */
+static inline void
+xfs_rtbitmap_setword(
+ struct xfs_rtalloc_args *args,
+ unsigned int index,
+ xfs_rtword_t value)
+{
+ union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+
+ word->old = value;
+}
+
+/*
+ * Convert an rt extent length and rt bitmap block number to an xfs_suminfo_t
+ * offset within the rt summary file.
+ */
+static inline xfs_rtsumoff_t
+xfs_rtsumoffs(
+ struct xfs_mount *mp,
+ int log2_len,
+ xfs_fileoff_t rbmoff)
+{
+ return log2_len * mp->m_sb.sb_rbmblocks + rbmoff;
+}
+
+/*
+ * Convert an xfs_suminfo_t offset to a file block offset within the rt summary
+ * file.
+ */
+static inline xfs_fileoff_t
+xfs_rtsumoffs_to_block(
+ struct xfs_mount *mp,
+ xfs_rtsumoff_t rsumoff)
+{
+ return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t));
+}
+
+/*
+ * Convert an xfs_suminfo_t offset to an info word offset within an rt summary
+ * block.
+ */
+static inline unsigned int
+xfs_rtsumoffs_to_infoword(
+ struct xfs_mount *mp,
+ xfs_rtsumoff_t rsumoff)
+{
+ unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG;
+
+ return rsumoff & mask;
+}
+
+/* Return a pointer to a summary info word within an rt summary block. */
+static inline union xfs_suminfo_raw *
+xfs_rsumblock_infoptr(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_suminfo_raw *info = args->sumbp->b_addr;
+
+ return info + index;
+}
+
+/* Get the current value of a summary counter. */
+static inline xfs_suminfo_t
+xfs_suminfo_get(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+
+ return info->old;
+}
+
+/* Add to the current value of a summary counter and return the new value. */
+static inline xfs_suminfo_t
+xfs_suminfo_add(
+ struct xfs_rtalloc_args *args,
+ unsigned int index,
+ int delta)
+{
+ union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+
+ info->old += delta;
+ return info->old;
+}
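The offset, block and word helpers above are meant to be composed; as a sketch only (not from this series), a read of one summary counter could look like the following, assuming the caller has filled in @args and already holds the rtsummary inode lock:

/*
 * Hypothetical helper, not part of the patch: look up the summary count
 * of free extents of length 2^@log starting in rtbitmap block @bbno.
 */
static int
example_read_suminfo(
	struct xfs_rtalloc_args	*args,
	int			log,
	xfs_fileoff_t		bbno,
	xfs_suminfo_t		*sum)
{
	xfs_rtsumoff_t		so = xfs_rtsumoffs(args->mp, log, bbno);
	int			error;

	/* pull in the summary block that holds this counter */
	error = xfs_rtsummary_read_buf(args,
			xfs_rtsumoffs_to_block(args->mp, so));
	if (error)
		return error;

	*sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(args->mp, so));
	return 0;
}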
+
+/*
+ * Functions for walking free space rtextents in the realtime bitmap.
+ */
+struct xfs_rtalloc_rec {
+ xfs_rtxnum_t ar_startext;
+ xfs_rtbxlen_t ar_extcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv);
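As another sketch that is not part of the patch, a trivial consumer of this callback type (paired with the xfs_rtalloc_query_all() prototype declared just below) might total up the free rt extents, much as the scrub and fsmap callers elsewhere in the series do; the caller is assumed to hold the rtbitmap ILOCK:

/* Hypothetical query callback: accumulate the free rtextent count. */
static int
example_count_free_rec(
	struct xfs_mount		*mp,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	xfs_rtbxlen_t			*rtx_free = priv;

	*rtx_free += rec->ar_extcount;
	return 0;
}

/* Hypothetical caller: walk every free range in the rt bitmap. */
static int
example_count_free(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_rtbxlen_t		*rtx_free)
{
	*rtx_free = 0;
	return xfs_rtalloc_query_all(mp, tp, example_count_free_rec,
			rtx_free);
}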
+
+#ifdef CONFIG_XFS_RT
+void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args);
+
+int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block,
+ int issum);
+
+static inline int
+xfs_rtbitmap_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ return xfs_rtbuf_get(args, block, 0);
+}
+
+static inline int
+xfs_rtsummary_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ return xfs_rtbuf_get(args, block, 1);
+}
+
+int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum);
+int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, int delta);
+int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len);
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtxnum_t start, xfs_rtxlen_t len,
+ bool *is_free);
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtxnum_t start, /* starting rtext number to free */
+ xfs_rtxlen_t len); /* length of extent freed */
+
+/* Same as above, but in units of rt blocks. */
+int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen);
+
+xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
+ rtextents);
+unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents);
+
+xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
+ unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
+ unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+#else /* CONFIG_XFS_RT */
+# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
+# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
+# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS)
+# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS)
+# define xfs_rtsummary_read_buf(a,b) (-ENOSYS)
+# define xfs_rtbuf_cache_relse(a) (0)
+# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
+static inline xfs_filblks_t
+xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
+{
+ /* shut up gcc */
+ return 0;
+}
+# define xfs_rtbitmap_wordcount(mp, r) (0)
+# define xfs_rtsummary_blockcount(mp, l, b) (0)
+# define xfs_rtsummary_wordcount(mp, l, b) (0)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6264daaab37b..1f74d0cd1618 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -975,6 +975,8 @@ xfs_sb_mount_common(
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
+ mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index a5e14740ec9a..19134b23c10b 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -25,7 +25,7 @@ extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp);
extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
-#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
+#define XFS_FS_GEOM_MAX_STRUCT_VER (5)
extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo,
int struct_version);
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 6b2296ff248a..70e97ea6eee7 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -65,7 +65,7 @@ xfs_trans_ichgtime(
tv = current_time(inode);
if (flags & XFS_ICHGTIME_MOD)
- inode->i_mtime = tv;
+ inode_set_mtime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CHG)
inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5b2f27cbdb80..6cd45e8c118d 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -19,6 +19,7 @@
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_trans_space.h"
+#include "xfs_rtbitmap.h"
#define _ALLOC true
#define _FREE false
@@ -217,11 +218,12 @@ xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
- unsigned int blksz = XFS_FSB_TO_B(mp, 1);
- unsigned int rtbmp_bytes;
+ unsigned int rtbmp_blocks;
+ xfs_rtxlen_t rtxlen;
- rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
- return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
+ rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
+ rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen);
+ return (rtbmp_blocks + 1) * num_ops;
}
/*
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 5c2765934732..c299b16c9365 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -148,10 +148,10 @@ xfs_verify_rtbno(
/* Verify that a realtime device extent is fully contained inside the volume. */
bool
-xfs_verify_rtext(
+xfs_verify_rtbext(
struct xfs_mount *mp,
xfs_rtblock_t rtbno,
- xfs_rtblock_t len)
+ xfs_filblks_t len)
{
if (rtbno + len <= rtbno)
return false;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 851220021484..533200c4ccc2 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -11,6 +11,7 @@ typedef uint32_t prid_t; /* project ID */
typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
+typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
typedef uint64_t xfs_extnum_t; /* # of extents in a file */
typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
@@ -18,6 +19,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
+typedef uint32_t xfs_rtsumoff_t; /* offset of an rtsummary info word */
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
@@ -31,6 +33,8 @@ typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
typedef uint64_t xfs_fileoff_t; /* block number in a file */
typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
+typedef uint64_t xfs_rtxnum_t; /* rtextent number */
+typedef uint64_t xfs_rtbxlen_t; /* rtbitmap extent length in rtextents */
typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
@@ -47,6 +51,7 @@ typedef void * xfs_failaddr_t;
#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
#define NULLRTBLOCK ((xfs_rtblock_t)-1)
#define NULLFILEOFF ((xfs_fileoff_t)-1)
+#define NULLRTEXTNO ((xfs_rtxnum_t)-1)
#define NULLAGBLOCK ((xfs_agblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
@@ -145,6 +150,7 @@ typedef uint32_t xfs_dqid_t;
*/
#define XFS_NBBYLOG 3 /* log2(NBBY) */
#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
+#define XFS_SUMINFOLOG 2 /* log2(sizeof(xfs_suminfo_t)) */
#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
#define XFS_NBWORD (1 << XFS_NBWORDLOG)
#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
@@ -229,8 +235,8 @@ bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
- xfs_rtblock_t len);
+bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
+ xfs_filblks_t len);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 75588915572e..06d8c1996a33 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -410,7 +410,7 @@ xchk_bmap_iextent(
/* Make sure the extent points to a valid place. */
if (info->is_rt &&
- !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
+ !xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount))
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (!info->is_rt &&
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 05be757668bb..5799e9a94f1f 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -16,7 +16,7 @@
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 59d7912fb75f..889f556bc98f 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -20,6 +20,7 @@
#include "xfs_reflink.h"
#include "xfs_rmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -225,7 +226,7 @@ xchk_inode_extsize(
*/
if ((flags & XFS_DIFLAG_RTINHERIT) &&
(flags & XFS_DIFLAG_EXTSZINHERIT) &&
- value % sc->mp->m_sb.sb_rextsize > 0)
+ xfs_extlen_to_rtxmod(sc->mp, value) > 0)
xchk_ino_set_warning(sc, ino);
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 008ddb599e13..41a1d89ae8e6 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -11,7 +11,7 @@
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "scrub/scrub.h"
@@ -48,12 +48,12 @@ xchk_rtbitmap_rec(
{
struct xfs_scrub *sc = priv;
xfs_rtblock_t startblock;
- xfs_rtblock_t blockcount;
+ xfs_filblks_t blockcount;
- startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ startblock = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- if (!xfs_verify_rtext(mp, startblock, blockcount))
+ if (!xfs_verify_rtbext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -128,26 +128,22 @@ out:
void
xchk_xref_is_used_rt_space(
struct xfs_scrub *sc,
- xfs_rtblock_t fsbno,
+ xfs_rtblock_t rtbno,
xfs_extlen_t len)
{
- xfs_rtblock_t startext;
- xfs_rtblock_t endext;
- xfs_rtblock_t extcount;
+ xfs_rtxnum_t startext;
+ xfs_rtxnum_t endext;
bool is_free;
int error;
if (xchk_skip_xref(sc->sm))
return;
- startext = fsbno;
- endext = fsbno + len - 1;
- do_div(startext, sc->mp->m_sb.sb_rextsize);
- do_div(endext, sc->mp->m_sb.sb_rextsize);
- extcount = endext - startext + 1;
+ startext = xfs_rtb_to_rtx(sc->mp, rtbno);
+ endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
- &is_free);
+ error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext,
+ endext - startext + 1, &is_free);
if (!xchk_should_check_xref(sc, &error, NULL))
goto out_unlock;
if (is_free)
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 437ed9acbb27..8b15c47408d0 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -13,7 +13,7 @@
#include "xfs_inode.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "scrub/scrub.h"
@@ -81,34 +81,45 @@ typedef unsigned int xchk_rtsumoff_t;
static inline int
xfsum_load(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- xfs_suminfo_t *info)
+ xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo)
{
- return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t),
+ return xfile_obj_load(sc->xfile, rawinfo,
+ sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
static inline int
xfsum_store(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- const xfs_suminfo_t info)
+ xfs_rtsumoff_t sumoff,
+ const union xfs_suminfo_raw rawinfo)
{
- return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t),
+ return xfile_obj_store(sc->xfile, &rawinfo,
+ sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
static inline int
xfsum_copyout(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- xfs_suminfo_t *info,
+ xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo,
unsigned int nr_words)
{
- return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG,
+ return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
sumoff << XFS_WORDLOG);
}
+static inline xfs_suminfo_t
+xchk_rtsum_inc(
+ struct xfs_mount *mp,
+ union xfs_suminfo_raw *v)
+{
+ v->old += 1;
+ return v->old;
+}
+
/* Update the summary file to reflect the free extent that we've accumulated. */
STATIC int
xchk_rtsum_record_free(
@@ -121,23 +132,24 @@ xchk_rtsum_record_free(
xfs_fileoff_t rbmoff;
xfs_rtblock_t rtbno;
xfs_filblks_t rtlen;
- xchk_rtsumoff_t offs;
+ xfs_rtsumoff_t offs;
unsigned int lenlog;
- xfs_suminfo_t v = 0;
+ union xfs_suminfo_raw v;
+ xfs_suminfo_t value;
int error = 0;
if (xchk_should_terminate(sc, &error))
return error;
/* Compute the relevant location in the rtsum file. */
- rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext);
+ rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext);
lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
- offs = XFS_SUMOFFS(mp, lenlog, rbmoff);
+ offs = xfs_rtsumoffs(mp, lenlog, rbmoff);
- rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
- rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- if (!xfs_verify_rtext(mp, rtbno, rtlen)) {
+ if (!xfs_verify_rtbext(mp, rtbno, rtlen)) {
xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
return -EFSCORRUPTED;
}
@@ -147,9 +159,9 @@ xchk_rtsum_record_free(
if (error)
return error;
- v++;
+ value = xchk_rtsum_inc(sc->mp, &v);
trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount,
- lenlog, offs, v);
+ lenlog, offs, value);
return xfsum_store(sc, offs, v);
}
@@ -160,12 +172,11 @@ xchk_rtsum_compute(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
- unsigned long long rtbmp_bytes;
+ unsigned long long rtbmp_blocks;
/* If the bitmap size doesn't match the computed size, bail. */
- rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY);
- if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) !=
- mp->m_rbmip->i_disk_size)
+ rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents);
+ if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size)
return -EFSCORRUPTED;
return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
@@ -177,14 +188,18 @@ STATIC int
xchk_rtsum_compare(
struct xfs_scrub *sc)
{
+ struct xfs_rtalloc_args args = {
+ .mp = sc->mp,
+ .tp = sc->tp,
+ };
struct xfs_mount *mp = sc->mp;
- struct xfs_buf *bp;
struct xfs_bmbt_irec map;
xfs_fileoff_t off;
xchk_rtsumoff_t sumoff = 0;
int nmap;
for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
+ union xfs_suminfo_raw *ondisk_info;
int error = 0;
if (xchk_should_terminate(sc, &error))
@@ -205,22 +220,23 @@ xchk_rtsum_compare(
}
/* Read a block's worth of ondisk rtsummary file. */
- error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp);
+ error = xfs_rtsummary_read_buf(&args, off);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
/* Read a block's worth of computed rtsummary file. */
error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
if (error) {
- xfs_trans_brelse(sc->tp, bp);
+ xfs_rtbuf_cache_relse(&args);
return error;
}
- if (memcmp(bp->b_addr, sc->buf,
+ ondisk_info = xfs_rsumblock_infoptr(&args, 0);
+ if (memcmp(ondisk_info, sc->buf,
mp->m_blockwsize << XFS_WORDLOG) != 0)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
- xfs_trans_brelse(sc->tp, bp);
+ xfs_rtbuf_cache_relse(&args);
sumoff += mp->m_blockwsize;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 46249e7b17e0..29afa4851235 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -13,6 +13,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index cbd4d01e253c..4a8bc6f3c8f2 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1036,17 +1036,18 @@ TRACE_EVENT(xfarray_sort_stats,
#ifdef CONFIG_XFS_RT
TRACE_EVENT(xchk_rtsum_record_free,
- TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start,
- uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v),
- TP_ARGS(mp, start, len, log, pos, v),
+ TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start,
+ xfs_rtbxlen_t len, unsigned int log, loff_t pos,
+ xfs_suminfo_t value),
+ TP_ARGS(mp, start, len, log, pos, value),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(dev_t, rtdev)
- __field(xfs_rtblock_t, start)
+ __field(xfs_rtxnum_t, start)
__field(unsigned long long, len)
__field(unsigned int, log)
__field(loff_t, pos)
- __field(xfs_suminfo_t, v)
+ __field(xfs_suminfo_t, value)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
@@ -1055,7 +1056,7 @@ TRACE_EVENT(xchk_rtsum_record_free,
__entry->len = len;
__entry->log = log;
__entry->pos = pos;
- __entry->v = v;
+ __entry->value = value;
),
TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1064,7 +1065,7 @@ TRACE_EVENT(xchk_rtsum_record_free,
__entry->len,
__entry->log,
__entry->pos,
- __entry->v)
+ __entry->value)
);
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fcefab687285..731260a5af6d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -28,6 +28,7 @@
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_rtbitmap.h"
/* Kernel only BMAP related definitions and functions */
@@ -75,28 +76,28 @@ xfs_bmap_rtalloc(
{
struct xfs_mount *mp = ap->ip->i_mount;
xfs_fileoff_t orig_offset = ap->offset;
- xfs_rtblock_t rtb;
- xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_rtxnum_t rtx;
+ xfs_rtxlen_t prod = 0; /* product factor for allocators */
xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_rtxlen_t ralen = 0; /* realtime allocation length */
xfs_extlen_t align; /* minimum allocation alignment */
xfs_extlen_t orig_length = ap->length;
xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
- xfs_extlen_t raminlen;
+ xfs_rtxlen_t raminlen;
bool rtlocked = false;
bool ignore_locality = false;
int error;
align = xfs_get_extsz_hint(ap->ip);
retry:
- prod = align / mp->m_sb.sb_rextsize;
+ prod = xfs_extlen_to_rtxlen(mp, align);
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 1, ap->eof, 0,
ap->conv, &ap->offset, &ap->length);
if (error)
return error;
ASSERT(ap->length);
- ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+ ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0);
/*
* If we shifted the file offset downward to satisfy an extent size
@@ -116,17 +117,14 @@ retry:
prod = 1;
/*
* Set ralen to be the actual requested length in rtextents.
- */
- ralen = ap->length / mp->m_sb.sb_rextsize;
- /*
+ *
* If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
* XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
- ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
+ ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
/*
* Lock out modifications to both the RT bitmap and summary inodes
@@ -144,12 +142,10 @@ retry:
* pick an extent that will space things out in the rt area.
*/
if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t rtx; /* realtime extent no */
-
error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
if (error)
return error;
- ap->blkno = rtx * mp->m_sb.sb_rextsize;
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
} else {
ap->blkno = 0;
}
@@ -160,20 +156,18 @@ retry:
* Realtime allocation, done through xfs_rtallocate_extent.
*/
if (ignore_locality)
- ap->blkno = 0;
+ rtx = 0;
else
- do_div(ap->blkno, mp->m_sb.sb_rextsize);
- rtb = ap->blkno;
- ap->length = ralen;
- raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
- error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
- &ralen, ap->wasdel, prod, &rtb);
+ rtx = xfs_rtb_to_rtx(mp, ap->blkno);
+ raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
+ error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen,
+ ap->wasdel, prod, &rtx);
if (error)
return error;
- if (rtb != NULLRTBLOCK) {
- ap->blkno = rtb * mp->m_sb.sb_rextsize;
- ap->length = ralen * mp->m_sb.sb_rextsize;
+ if (rtx != NULLRTEXTNO) {
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
+ ap->length = xfs_rtxlen_to_extlen(mp, ralen);
ap->ip->i_nblocks += ap->length;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
@@ -690,7 +684,7 @@ xfs_can_free_eofblocks(
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
- end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
+ end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
@@ -780,12 +774,10 @@ xfs_alloc_file_space(
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
- xfs_filblks_t allocated_fsb;
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
- int nimaps;
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
@@ -808,7 +800,6 @@ xfs_alloc_file_space(
count = len;
imapp = &imaps[0];
- nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
allocatesize_fsb = endoffset_fsb - startoffset_fsb;
@@ -819,6 +810,7 @@ xfs_alloc_file_space(
while (allocatesize_fsb && !error) {
xfs_fileoff_t s, e;
unsigned int dblocks, rblocks, resblks;
+ int nimaps = 1;
/*
* Determine space reservations for data/realtime.
@@ -884,15 +876,19 @@ xfs_alloc_file_space(
if (error)
break;
- allocated_fsb = imapp->br_blockcount;
-
- if (nimaps == 0) {
- error = -ENOSPC;
- break;
+ /*
+ * If the allocator cannot find a single free extent large
+ * enough to cover the start block of the requested range,
+ * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
+ *
+ * In that case we simply need to keep looping with the same
+ * startoffset_fsb so that one of the following allocations
+ * will eventually reach the requested range.
+ */
+ if (nimaps) {
+ startoffset_fsb += imapp->br_blockcount;
+ allocatesize_fsb -= imapp->br_blockcount;
}
-
- startoffset_fsb += allocated_fsb;
- allocatesize_fsb -= allocated_fsb;
}
return error;
@@ -989,10 +985,8 @@ xfs_free_file_space(
/* We can only free complete realtime extents. */
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
- startoffset_fsb = roundup_64(startoffset_fsb,
- mp->m_sb.sb_rextsize);
- endoffset_fsb = rounddown_64(endoffset_fsb,
- mp->m_sb.sb_rextsize);
+ startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
+ endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
}
/*
@@ -1644,7 +1638,7 @@ xfs_swap_extents(
uint64_t f;
int resblks = 0;
unsigned int flags = 0;
- struct timespec64 ctime;
+ struct timespec64 ctime, mtime;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1758,10 +1752,11 @@ xfs_swap_extents(
* under it.
*/
ctime = inode_get_ctime(VFS_I(ip));
+ mtime = inode_get_mtime(VFS_I(ip));
if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
(sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
- (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
- (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ (sbp->bs_mtime.tv_sec != mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) {
error = -EBUSY;
goto out_trans_cancel;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c1ece4a08ff4..545c7991b9b5 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1913,8 +1913,7 @@ xfs_buftarg_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_buftarg *btp = container_of(shrink,
- struct xfs_buftarg, bt_shrinker);
+ struct xfs_buftarg *btp = shrink->private_data;
LIST_HEAD(dispose);
unsigned long freed;
@@ -1936,8 +1935,7 @@ xfs_buftarg_shrink_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_buftarg *btp = container_of(shrink,
- struct xfs_buftarg, bt_shrinker);
+ struct xfs_buftarg *btp = shrink->private_data;
return list_lru_shrink_count(&btp->bt_lru, sc);
}
@@ -1945,17 +1943,15 @@ void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
- struct block_device *bdev = btp->bt_bdev;
-
- unregister_shrinker(&btp->bt_shrinker);
+ shrinker_free(btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
- if (bdev != btp->bt_mount->m_super->s_bdev)
- blkdev_put(bdev, btp->bt_mount->m_super);
+ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
+ bdev_release(btp->bt_bdev_handle);
kmem_free(btp);
}
@@ -1990,16 +1986,15 @@ xfs_setsize_buftarg(
*/
STATIC int
xfs_setsize_buftarg_early(
- xfs_buftarg_t *btp,
- struct block_device *bdev)
+ xfs_buftarg_t *btp)
{
- return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
+ return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev));
}
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev)
+ struct bdev_handle *bdev_handle)
{
xfs_buftarg_t *btp;
const struct dax_holder_operations *ops = NULL;
@@ -2010,9 +2005,10 @@ xfs_alloc_buftarg(
btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
- btp->bt_dev = bdev->bd_dev;
- btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+ btp->bt_bdev_handle = bdev_handle;
+ btp->bt_dev = bdev_handle->bdev->bd_dev;
+ btp->bt_bdev = bdev_handle->bdev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
/*
@@ -2022,7 +2018,7 @@ xfs_alloc_buftarg(
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
- if (xfs_setsize_buftarg_early(btp, bdev))
+ if (xfs_setsize_buftarg_early(btp))
goto error_free;
if (list_lru_init(&btp->bt_lru))
@@ -2031,13 +2027,17 @@ xfs_alloc_buftarg(
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto error_lru;
- btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
- btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
- btp->bt_shrinker.seeks = DEFAULT_SEEKS;
- btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
- if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
- mp->m_super->s_id))
+ btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
+ mp->m_super->s_id);
+ if (!btp->bt_shrinker)
goto error_pcpu;
+
+ btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
+ btp->bt_shrinker->private_data = btp;
+
+ shrinker_register(btp->bt_shrinker);
+
return btp;
error_pcpu:
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index df8f47953bb4..c86e16419656 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -98,6 +98,7 @@ typedef unsigned int xfs_buf_flags_t;
*/
typedef struct xfs_buftarg {
dev_t bt_dev;
+ struct bdev_handle *bt_bdev_handle;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
u64 bt_dax_part_off;
@@ -108,7 +109,7 @@ typedef struct xfs_buftarg {
size_t bt_logical_sectormask;
/* LRU control structures */
- struct shrinker bt_shrinker;
+ struct shrinker *bt_shrinker;
struct list_lru bt_lru;
struct percpu_counter bt_io_count;
@@ -364,7 +365,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
- struct block_device *bdev);
+ struct bdev_handle *bdev_handle);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index ac6ba646624d..a013b87ab8d5 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -562,7 +562,8 @@ xfs_dquot_from_disk(
struct xfs_dquot *dqp,
struct xfs_buf *bp)
{
- struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
+ struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset);
+ struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq;
/*
* Ensure that we got the type and ID we were looking for.
@@ -1250,7 +1251,7 @@ xfs_qm_dqflush(
}
/* Flush the incore dquot to the ondisk buffer. */
- dqblk = bp->b_addr + dqp->q_bufoffset;
+ dqblk = xfs_buf_offset(bp, dqp->q_bufoffset);
xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp);
/*
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index 8966ba842395..2c2720ce6923 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_error.h"
STATIC void
xlog_recover_dquot_ra_pass2(
@@ -65,6 +66,7 @@ xlog_recover_dquot_commit_pass2(
{
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
+ struct xfs_dqblk *dqb;
struct xfs_disk_dquot *ddq, *recddq;
struct xfs_dq_logformat *dq_f;
xfs_failaddr_t fa;
@@ -130,14 +132,14 @@ xlog_recover_dquot_commit_pass2(
return error;
ASSERT(bp);
- ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
+ dqb = xfs_buf_offset(bp, dq_f->qlf_boffset);
+ ddq = &dqb->dd_diskdq;
/*
* If the dquot has an LSN in it, recover the dquot only if it's less
* than the lsn of the transaction we are replaying.
*/
if (xfs_has_crc(mp)) {
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
@@ -147,10 +149,23 @@ xlog_recover_dquot_commit_pass2(
memcpy(ddq, recddq, item->ri_buf[1].i_len);
if (xfs_has_crc(mp)) {
- xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
+ /* Validate the recovered dquot. */
+ fa = xfs_dqblk_verify(log->l_mp, dqb, dq_f->qlf_id);
+ if (fa) {
+ XFS_CORRUPTION_ERROR("Bad dquot after recovery",
+ XFS_ERRLEVEL_LOW, mp, dqb,
+ sizeof(struct xfs_dqblk));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, dquot 0x%x",
+ fa, dq_f->qlf_id);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_mount == mp);
bp->b_flags |= _XBF_LOGRECOVERY;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..e33e5e13b95f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -214,6 +214,43 @@ xfs_ilock_iocb(
return 0;
}
+static int
+xfs_ilock_iocb_for_write(
+ struct kiocb *iocb,
+ unsigned int *lock_mode)
+{
+ ssize_t ret;
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ ret = xfs_ilock_iocb(iocb, *lock_mode);
+ if (ret)
+ return ret;
+
+ if (*lock_mode == XFS_IOLOCK_EXCL)
+ return 0;
+ if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+ return 0;
+
+ xfs_iunlock(ip, *lock_mode);
+ *lock_mode = XFS_IOLOCK_EXCL;
+ return xfs_ilock_iocb(iocb, *lock_mode);
+}
+
+static unsigned int
+xfs_ilock_for_write_fault(
+ struct xfs_inode *ip)
+{
+ /* get a shared lock if no remapping in progress */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+ return XFS_MMAPLOCK_SHARED;
+
+ /* wait for remapping to complete */
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ return XFS_MMAPLOCK_EXCL;
+}
+
STATIC ssize_t
xfs_file_dio_read(
struct kiocb *iocb,
@@ -551,7 +588,7 @@ xfs_file_dio_write_aligned(
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -618,7 +655,7 @@ retry_exclusive:
flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
@@ -1180,7 +1217,7 @@ xfs_file_remap_range(
if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
xfs_log_force_inode(dest);
out_unlock:
- xfs_iunlock2_io_mmap(src, dest);
+ xfs_iunlock2_remapping(src, dest);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
@@ -1328,6 +1365,7 @@ __xfs_filemap_fault(
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
vm_fault_t ret;
+ unsigned int lock_mode = 0;
trace_xfs_filemap_fault(ip, order, write_fault);
@@ -1336,25 +1374,24 @@ __xfs_filemap_fault(
file_update_time(vmf->vma->vm_file);
}
+ if (IS_DAX(inode) || write_fault)
+ lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+
if (IS_DAX(inode)) {
pfn_t pfn;
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, order, pfn);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else if (write_fault) {
+ ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
} else {
- if (write_fault) {
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = iomap_page_mkwrite(vmf,
- &xfs_page_mkwrite_iomap_ops);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- } else {
- ret = filemap_fault(vmf);
- }
+ ret = filemap_fault(vmf);
}
+ if (lock_mode)
+ xfs_iunlock(XFS_I(inode), lock_mode);
+
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 736e5545f584..5a72217f5feb 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -23,7 +23,7 @@
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_ag.h"
/* Convert an xfs_fsmap to an fsmap. */
@@ -483,11 +483,11 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
xfs_rtblock_t rtbno;
xfs_daddr_t rec_daddr, len_daddr;
- rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
irec.rm_startblock = rtbno;
- rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
len_daddr = XFS_FSB_TO_BB(mp, rtbno);
irec.rm_blockcount = rtbno;
@@ -514,7 +514,7 @@ xfs_getfsmap_rtdev_rtbitmap(
uint64_t eofs;
int error;
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize);
+ eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
if (keys[0].fmr_physical >= eofs)
return 0;
start_rtb = XFS_BB_TO_FSBT(mp,
@@ -539,11 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap(
* Set up query parameters to return free rtextents covering the range
* we want.
*/
- alow.ar_startext = start_rtb;
- ahigh.ar_startext = end_rtb;
- do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
- if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
- ahigh.ar_startext++;
+ alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb);
+ ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb);
error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3c210ac83713..dba514a2c84d 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2165,8 +2165,7 @@ xfs_inodegc_shrinker_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
- m_inodegc_shrinker);
+ struct xfs_mount *mp = shrink->private_data;
struct xfs_inodegc *gc;
int cpu;
@@ -2187,8 +2186,7 @@ xfs_inodegc_shrinker_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
- m_inodegc_shrinker);
+ struct xfs_mount *mp = shrink->private_data;
struct xfs_inodegc *gc;
int cpu;
bool no_items = true;
@@ -2224,13 +2222,19 @@ int
xfs_inodegc_register_shrinker(
struct xfs_mount *mp)
{
- struct shrinker *shrink = &mp->m_inodegc_shrinker;
+ mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
+ "xfs-inodegc:%s",
+ mp->m_super->s_id);
+ if (!mp->m_inodegc_shrinker)
+ return -ENOMEM;
+
+ mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
+ mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
+ mp->m_inodegc_shrinker->seeks = 0;
+ mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
+ mp->m_inodegc_shrinker->private_data = mp;
- shrink->count_objects = xfs_inodegc_shrinker_count;
- shrink->scan_objects = xfs_inodegc_shrinker_scan;
- shrink->seeks = 0;
- shrink->flags = SHRINKER_NONSLAB;
- shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
+ shrinker_register(mp->m_inodegc_shrinker);
- return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
+ return 0;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4d55f58d99b7..c0f1c89786c2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -844,8 +844,8 @@ xfs_init_new_inode(
ASSERT(ip->i_nblocks == 0);
tv = inode_set_ctime_current(inode);
- inode->i_mtime = tv;
- inode->i_atime = tv;
+ inode_set_mtime_to_ts(inode, tv);
+ inode_set_atime_to_ts(inode, tv);
ip->i_extsize = 0;
ip->i_diflags = 0;
@@ -918,6 +918,13 @@ xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
{
+ if (VFS_I(ip)->i_nlink == 0) {
+ xfs_alert(ip->i_mount,
+ "%s: Attempt to drop inode (%llu) with nlink zero.",
+ __func__, ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
drop_nlink(VFS_I(ip));
@@ -3621,6 +3628,23 @@ xfs_iunlock2_io_mmap(
inode_unlock(VFS_I(ip1));
}
+/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
+void
+xfs_iunlock2_remapping(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ xfs_iflags_clear(ip1, XFS_IREMAPPING);
+
+ if (ip1 != ip2)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+
+ if (ip1 != ip2)
+ inode_unlock_shared(VFS_I(ip1));
+ inode_unlock(VFS_I(ip2));
+}
+
/*
* Reload the incore inode list for this inode. Caller should ensure that
* the link count cannot change, either by taking ILOCK_SHARED or otherwise
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..3beb470f1892 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
/* Quotacheck is running but inode has not been added to quota counts. */
#define XFS_IQUOTAUNCHECKED (1 << 14)
+/*
+ * Remap in progress. Callers that wish to update file data while
+ * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
+ * the lock in exclusive mode. Relocking the file will block until
+ * IREMAPPING is cleared.
+ */
+#define XFS_IREMAPPING (1U << 15)
+
/* All inode state flags related to inode reclaim. */
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
XFS_IRECLAIM | \
@@ -561,6 +569,14 @@ extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+static inline void xfs_update_stable_writes(struct xfs_inode *ip)
+{
+ if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
+ mapping_set_stable_writes(VFS_I(ip)->i_mapping);
+ else
+ mapping_clear_stable_writes(VFS_I(ip)->i_mapping);
+}
+
/*
* When setting up a newly allocated inode, we need to call
* xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -595,6 +611,7 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
static inline bool
xfs_inode_unlinked_incomplete(
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 127b2410eb20..cd7803fda8b1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
#include <linux/iversion.h>
@@ -107,7 +108,7 @@ xfs_inode_item_precommit(
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
@@ -526,8 +527,8 @@ xfs_inode_to_log_dinode(
to->di_projid_hi = ip->i_projid >> 16;
memset(to->di_pad3, 0, sizeof(to->di_pad3));
- to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
- to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode));
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 0e5dba2343ea..144198a6b270 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2(
struct xfs_log_dinode *ldip;
uint isize;
int need_free = 0;
+ xfs_failaddr_t fa;
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
@@ -369,24 +370,26 @@ xlog_recover_inode_commit_pass2(
* superblock flag to determine whether we need to look at di_flushiter
* to skip replay when the on disk inode is newer than the log one
*/
- if (!xfs_has_v3inodes(mp) &&
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
- /*
- * Deal with the wrap case, DI_MAX_FLUSH is less
- * than smaller numbers
- */
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
- /* do nothing */
- } else {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_release;
+ if (!xfs_has_v3inodes(mp)) {
+ if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the wrap case, DI_MAX_FLUSH is less
+ * than smaller numbers
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_release;
+ }
}
+
+ /* Take the opportunity to reset the flush iteration count */
+ ldip->di_flushiter = 0;
}
- /* Take the opportunity to reset the flush iteration count */
- ldip->di_flushiter = 0;
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -528,8 +531,19 @@ out_owner_change:
(dip->di_mode != 0))
error = xfs_recover_inode_owner_change(mp, dip, in_f,
buffer_list);
- /* re-generate the checksum. */
+ /* re-generate the checksum and validate the recovered inode. */
xfs_dinode_calc_crc(log->l_mp, dip);
+ fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip);
+ if (fa) {
+ XFS_CORRUPTION_ERROR(
+ "Bad dinode after recovery",
+ XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, inode 0x%llx",
+ fa, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
ASSERT(bp->b_mount == mp);
bp->b_flags |= _XBF_LOGRECOVERY;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 55bb01173cde..6c3919687ea6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -38,6 +38,7 @@
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_rtbitmap.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -1004,7 +1005,7 @@ xfs_fill_fsxattr(
* later.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- ip->i_extsize % mp->m_sb.sb_rextsize > 0) {
+ xfs_extlen_to_rtxmod(mp, ip->i_extsize) > 0) {
fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE |
FS_XFLAG_EXTSZINHERIT);
fa->fsx_extsize = 0;
@@ -1120,23 +1121,25 @@ xfs_ioctl_setattr_xflags(
struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
+ bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
uint64_t i_flags2;
- /* Can't change realtime flag if any extents are allocated. */
- if ((ip->i_df.if_nextents || ip->i_delayed_blks) &&
- XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
- return -EINVAL;
+ if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
+ /* Can't change realtime flag if any extents are allocated. */
+ if (ip->i_df.if_nextents || ip->i_delayed_blks)
+ return -EINVAL;
+ }
- /* If realtime flag is set then must have realtime device */
- if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
+ if (rtflag) {
+ /* If realtime flag is set then must have realtime device */
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
- (ip->i_extsize % mp->m_sb.sb_rextsize))
+ xfs_extlen_to_rtxmod(mp, ip->i_extsize))
return -EINVAL;
- }
- /* Clear reflink if we are actually able to set the rt flag. */
- if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
- ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ /* Clear reflink if we are actually able to set the rt flag. */
+ if (xfs_is_reflink_inode(ip))
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ }
/* diflags2 only valid for v3 inodes. */
i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
@@ -1147,6 +1150,14 @@ xfs_ioctl_setattr_xflags(
ip->i_diflags2 = i_flags2;
xfs_diflags_to_iflags(ip, false);
+
+ /*
+ * Make the stable writes flag match that of the device the inode
+ * resides on when flipping the RT flag.
+ */
+ if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode))
+ xfs_update_stable_writes(ip);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index c14852362fce..052d0e888c27 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -22,7 +22,7 @@
/*
* On intel, even if sizes match, alignment and/or padding may differ.
*/
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64)
#define BROKEN_X86_ALIGNMENT
#define __compat_packed __attribute__((packed))
#else
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2b3b05c28e9e..a0d77f5f512e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -572,8 +572,8 @@ xfs_vn_getattr(
stat->uid = vfsuid_into_kuid(vfsuid);
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
- stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
+ stat->atime = inode_get_atime(inode);
+ stat->mtime = inode_get_mtime(inode);
stat->ctime = inode_get_ctime(inode);
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
@@ -1067,9 +1067,9 @@ xfs_vn_update_time(
now = current_time(inode);
if (flags & S_MTIME)
- inode->i_mtime = now;
+ inode_set_mtime_to_ts(inode, now);
if (flags & S_ATIME)
- inode->i_atime = now;
+ inode_set_atime_to_ts(inode, now);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, log_flags);
@@ -1299,6 +1299,13 @@ xfs_setup_inode(
mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
/*
+ * For real-time inodes update the stable write flags to that of the RT
+ * device instead of the data device.
+ */
+ if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip))
+ xfs_update_stable_writes(ip);
+
+ /*
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f5377ba5967a..14462614fcc8 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -107,12 +107,12 @@ xfs_bulkstat_one_int(
buf->bs_size = ip->i_disk_size;
buf->bs_nlink = inode->i_nlink;
- buf->bs_atime = inode->i_atime.tv_sec;
- buf->bs_atime_nsec = inode->i_atime.tv_nsec;
- buf->bs_mtime = inode->i_mtime.tv_sec;
- buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime = inode_get_ctime(inode).tv_sec;
- buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec;
+ buf->bs_atime = inode_get_atime_sec(inode);
+ buf->bs_atime_nsec = inode_get_atime_nsec(inode);
+ buf->bs_mtime = inode_get_mtime_sec(inode);
+ buf->bs_mtime_nsec = inode_get_mtime_nsec(inode);
+ buf->bs_ctime = inode_get_ctime_sec(inode);
+ buf->bs_ctime_nsec = inode_get_ctime_nsec(inode);
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e9d317a3dafe..d7873e0360f0 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -198,6 +198,18 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
return x;
}
+/* If @b is a power of 2, return log2(b). Else return -1. */
+static inline int8_t log2_if_power2(unsigned long b)
+{
+ return is_power_of_2(b) ? ilog2(b) : -1;
+}
+
+/* If @b is a power of 2, return a mask of the lower bits, else return zero. */
+static inline unsigned long long mask64_if_power2(unsigned long b)
+{
+ return is_power_of_2(b) ? b - 1 : 0;
+}
+
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, enum req_op op);
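
The two helpers added above back the m_rtxblklog/m_rtxblkmask mount fields introduced later in this series. As a quick check of their contract, here is a standalone userspace restatement (is_power_of_2() and the log2 computation are reimplemented locally; this is an illustration, not the kernel build):

#include <stdint.h>
#include <stdio.h>

static int is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static int8_t log2_if_power2(unsigned long b)
{
	/* bits-1 minus leading zeros gives floor(log2) for nonzero b */
	return is_power_of_2(b) ?
		(int8_t)(sizeof(unsigned long) * 8 - 1 - __builtin_clzl(b)) : -1;
}

static unsigned long long mask64_if_power2(unsigned long b)
{
	return is_power_of_2(b) ? b - 1 : 0;
}

int main(void)
{
	/* rextsize = 8 blocks: shift by 3, mask 0x7 */
	printf("%d 0x%llx\n", log2_if_power2(8), mask64_if_power2(8));
	/* rextsize = 6 blocks: not a power of two, callers fall back to division */
	printf("%d 0x%llx\n", log2_if_power2(6), mask64_if_power2(6));
	return 0;
}
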
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 51c100c86177..ee206facf0dc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1893,9 +1893,7 @@ xlog_write_iclog(
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog);
- up(&iclog->ic_sema);
- return;
+ goto sync;
}
/*
@@ -1925,20 +1923,17 @@ xlog_write_iclog(
* avoid shutdown re-entering this path and erroring out again.
*/
if (log->l_targ != log->l_mp->m_ddev_targp &&
- blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev))
+ goto shutdown;
}
if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
iclog->ic_bio.bi_opf |= REQ_FUA;
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
+ goto shutdown;
+
if (is_vmalloc_addr(iclog->ic_data))
flush_kernel_vmap_range(iclog->ic_data, count);
@@ -1959,6 +1954,12 @@ xlog_write_iclog(
}
submit_bio(&iclog->ic_bio);
+ return;
+shutdown:
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+sync:
+ xlog_state_done_syncing(iclog);
+ up(&iclog->ic_sema);
}
/*
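
The xlog_write_iclog() hunks above fold three open-coded early returns into shared shutdown/sync labels, so every path that never submits the bio still shuts down the log where appropriate and completes the iclog state machine. The control-flow shape, reduced to a small standalone example with generic names (not XFS code):

#include <stdio.h>

static int already_shut_down;
static int map_fails;

static void shutdown_log(void) { puts("shutdown"); }
static void done_syncing(void) { puts("done_syncing"); }
static void release_sema(void) { puts("up(sema)"); }

static void write_log_buffer(void)
{
	if (already_shut_down)
		goto sync;	/* no I/O possible, just finish the iclog */
	if (map_fails)
		goto shutdown;	/* failure before the bio is submitted */

	puts("submit_bio");	/* success: completion handles the rest */
	return;

shutdown:
	shutdown_log();		/* common error handling ... */
sync:
	done_syncing();		/* ... plus the completion-side cleanup */
	release_sema();		/* that every non-submitted path must do */
}

int main(void)
{
	write_log_buffer();		/* submit_bio */
	map_fails = 1;
	write_log_buffer();		/* shutdown + sync path */
	already_shut_down = 1;
	write_log_buffer();		/* sync-only path */
	return 0;
}
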
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13b94d2e605b..a1e18b24971a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2511,7 +2511,7 @@ xlog_abort_defer_ops(
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
}
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0a0fd19573d8..aed5be5508fe 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1021,7 +1021,7 @@ xfs_mountfs(
out_log_dealloc:
xfs_log_mount_cancel(mp);
out_inodegc_shrinker:
- unregister_shrinker(&mp->m_inodegc_shrinker);
+ shrinker_free(mp->m_inodegc_shrinker);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_buftarg_drain(mp->m_logdev_targp);
@@ -1104,7 +1104,7 @@ xfs_unmountfs(
#if defined(DEBUG)
xfs_errortag_clearall(mp);
#endif
- unregister_shrinker(&mp->m_inodegc_shrinker);
+ shrinker_free(mp->m_inodegc_shrinker);
xfs_free_perag(mp);
xfs_errortag_del(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d19cca099bc3..503fe3c7edbf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -101,9 +101,9 @@ typedef struct xfs_mount {
/*
* Optional cache of rt summary level per bitmap block with the
- * invariant that m_rsum_cache[bbno] <= the minimum i for which
- * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
- * inode lock.
+ * invariant that m_rsum_cache[bbno] > the maximum i for which
+ * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i.
+ * Reads and writes are serialized by the rsumip inode lock.
*/
uint8_t *m_rsum_cache;
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
@@ -119,6 +119,7 @@ typedef struct xfs_mount {
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
uint8_t m_agno_log; /* log #ag's */
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
+ int8_t m_rtxblklog; /* log2 of rextsize, if possible */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -152,6 +153,7 @@ typedef struct xfs_mount {
uint64_t m_features; /* active filesystem features */
uint64_t m_low_space[XFS_LOWSP_MAX];
uint64_t m_low_rtexts[XFS_LOWSP_MAX];
+ uint64_t m_rtxblkmask; /* rt extent block mask */
struct xfs_ino_geometry m_ino_geo; /* inode geometry */
struct xfs_trans_resv m_resv; /* precomputed res values */
/* low free space thresholds */
@@ -219,7 +221,7 @@ typedef struct xfs_mount {
atomic_t m_agirotor; /* last ag dir inode alloced */
/* Memory shrinker to throttle and reprioritize inodegc */
- struct shrinker m_inodegc_shrinker;
+ struct shrinker *m_inodegc_shrinker;
/*
* Workqueue item so that we can coalesce multiple inode flush attempts
* into a single flush.
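
The new m_rtxblklog/m_rtxblkmask fields cache log2(sb_rextsize) and the matching low-bit mask so rt block/extent conversions can use shifts and masks when the extent size is a power of two, falling back to division otherwise. A sketch of how such cached values might be consulted; the helper names below are hypothetical stand-ins, the real conversion helpers live elsewhere in the series:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the mount fields added above. */
struct rt_geom {
	uint32_t	rextsize;	/* rt extent size in fs blocks */
	int8_t		rtxblklog;	/* log2(rextsize), or -1 */
	uint64_t	rtxblkmask;	/* rextsize - 1, or 0 */
};

/* Hypothetical block -> rt extent conversion using the cached log. */
static uint64_t rtb_to_rtx(const struct rt_geom *g, uint64_t rtbno)
{
	if (g->rtxblklog >= 0)
		return rtbno >> g->rtxblklog;	/* power-of-two fast path */
	return rtbno / g->rextsize;		/* generic fallback */
}

/* Hypothetical offset-within-extent helper using the cached mask. */
static uint64_t rtb_to_rtxoff(const struct rt_geom *g, uint64_t rtbno)
{
	if (g->rtxblklog >= 0)
		return rtbno & g->rtxblkmask;
	return rtbno % g->rextsize;
}

int main(void)
{
	struct rt_geom pow2 = { .rextsize = 8, .rtxblklog = 3, .rtxblkmask = 7 };
	struct rt_geom odd  = { .rextsize = 6, .rtxblklog = -1, .rtxblkmask = 0 };

	printf("%llu %llu\n",
	       (unsigned long long)rtb_to_rtx(&pow2, 29),
	       (unsigned long long)rtb_to_rtxoff(&pow2, 29));	/* 3 5 */
	printf("%llu %llu\n",
	       (unsigned long long)rtb_to_rtx(&odd, 29),
	       (unsigned long long)rtb_to_rtxoff(&odd, 29));	/* 4 5 */
	return 0;
}
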
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index c4cc99b70dd3..21a7e350b4c5 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -72,6 +72,10 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+ /* realtime structures */
+ XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4);
+ XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4);
+
/*
* m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
* 4 bytes anyway so it's not obviously a problem. Hence for the moment
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 086e78a6143a..94a7932ac570 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -504,8 +504,7 @@ xfs_qm_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_quotainfo *qi = container_of(shrink,
- struct xfs_quotainfo, qi_shrinker);
+ struct xfs_quotainfo *qi = shrink->private_data;
struct xfs_qm_isolate isol;
unsigned long freed;
int error;
@@ -539,8 +538,7 @@ xfs_qm_shrink_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_quotainfo *qi = container_of(shrink,
- struct xfs_quotainfo, qi_shrinker);
+ struct xfs_quotainfo *qi = shrink->private_data;
return list_lru_shrink_count(&qi->qi_lru, sc);
}
@@ -680,15 +678,18 @@ xfs_qm_init_quotainfo(
if (XFS_IS_PQUOTA_ON(mp))
xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
- qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
- qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
- qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
- qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
-
- error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
- mp->m_super->s_id);
- if (error)
+ qinf->qi_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-qm:%s",
+ mp->m_super->s_id);
+ if (!qinf->qi_shrinker) {
+ error = -ENOMEM;
goto out_free_inos;
+ }
+
+ qinf->qi_shrinker->count_objects = xfs_qm_shrink_count;
+ qinf->qi_shrinker->scan_objects = xfs_qm_shrink_scan;
+ qinf->qi_shrinker->private_data = qinf;
+
+ shrinker_register(qinf->qi_shrinker);
return 0;
@@ -718,7 +719,7 @@ xfs_qm_destroy_quotainfo(
qi = mp->m_quotainfo;
ASSERT(qi != NULL);
- unregister_shrinker(&qi->qi_shrinker);
+ shrinker_free(qi->qi_shrinker);
list_lru_destroy(&qi->qi_lru);
xfs_qm_destroy_quotainos(qi);
mutex_destroy(&qi->qi_tree_lock);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9683f0457d19..d5c9fc4ba591 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -63,7 +63,7 @@ struct xfs_quotainfo {
struct xfs_def_quota qi_usr_default;
struct xfs_def_quota qi_grp_default;
struct xfs_def_quota qi_prj_default;
- struct shrinker qi_shrinker;
+ struct shrinker *qi_shrinker;
/* Minimum and maximum quota expiration timestamp values. */
time64_t qi_expiry_min;
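
The shrinker hunks in this section (xfs_mount, xfs_qm) all follow the same conversion: the shrinker is no longer embedded in its owning structure but allocated, filled in, registered, and eventually freed, with the owner reached through ->private_data instead of container_of(). A condensed kernel-style sketch of that pattern using only the calls visible in the hunks; "foo" and its reclaim helpers are placeholders, not real code:

#include <linux/shrinker.h>

struct foo {
	struct shrinker	*shrinker;
	/* ... cached objects, LRU, locks ... */
};

/* Placeholder cache accounting, standing in for a real LRU. */
static unsigned long foo_nr_cached(struct foo *foo) { return 0; }
static unsigned long foo_reclaim(struct foo *foo, unsigned long nr) { return 0; }

static unsigned long foo_shrink_count(struct shrinker *shrink,
				      struct shrink_control *sc)
{
	struct foo *foo = shrink->private_data;	/* was container_of() */

	return foo_nr_cached(foo);
}

static unsigned long foo_shrink_scan(struct shrinker *shrink,
				     struct shrink_control *sc)
{
	struct foo *foo = shrink->private_data;

	return foo_reclaim(foo, sc->nr_to_scan);
}

static int foo_init(struct foo *foo)
{
	foo->shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "foo:%s", "demo");
	if (!foo->shrinker)
		return -ENOMEM;

	foo->shrinker->count_objects = foo_shrink_count;
	foo->shrinker->scan_objects = foo_shrink_scan;
	foo->shrinker->private_data = foo;
	shrinker_register(foo->shrinker);
	return 0;
}

static void foo_exit(struct foo *foo)
{
	shrinker_free(foo->shrinker);	/* replaces unregister_shrinker() */
}
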
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index eb9102453aff..e5b62dc28466 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent(
}
}
del = got;
+ xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
@@ -1540,6 +1541,10 @@ xfs_reflink_remap_prep(
if (ret)
goto out_unlock;
+ xfs_iflags_set(src, XFS_IREMAPPING);
+ if (inode_in != inode_out)
+ xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
return 0;
out_unlock:
xfs_iunlock2_io_mmap(src, dest);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 16534e9873f6..88c48de5c9c8 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -19,6 +19,7 @@
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
+#include "xfs_rtbitmap.h"
/*
* Read and return the summary information for a given extent size,
@@ -28,48 +29,48 @@
*/
static int
xfs_rtget_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
+ return xfs_rtmodify_summary_int(args, log, bbno, 0, sum);
}
/*
* Return whether there are any free extents in the size range given
* by low and high, for the bitmap block bbno.
*/
-STATIC int /* error */
+STATIC int
xfs_rtany_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int low, /* low log2 extent size */
- int high, /* high log2 extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- int *stat) /* out: any good extents here? */
+ struct xfs_rtalloc_args *args,
+ int low, /* low log2 extent size */
+ int high, /* high log2 extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int *maxlog) /* out: max log2 extent size free */
{
- int error; /* error value */
- int log; /* loop counter, log2 of ext. size */
- xfs_suminfo_t sum; /* summary data */
-
- /* There are no extents at levels < m_rsum_cache[bbno]. */
- if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno])
- low = mp->m_rsum_cache[bbno];
+ struct xfs_mount *mp = args->mp;
+ int error;
+ int log; /* loop counter, log2 of ext. size */
+ xfs_suminfo_t sum; /* summary data */
+
+ /* There are no extents at levels >= m_rsum_cache[bbno]. */
+ if (mp->m_rsum_cache) {
+ high = min(high, mp->m_rsum_cache[bbno] - 1);
+ if (low > high) {
+ *maxlog = -1;
+ return 0;
+ }
+ }
/*
* Loop over logs of extent sizes.
*/
- for (log = low; log <= high; log++) {
+ for (log = high; log >= low; log--) {
/*
* Get one summary datum.
*/
- error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+ error = xfs_rtget_summary(args, log, bbno, &sum);
if (error) {
return error;
}
@@ -77,18 +78,18 @@ xfs_rtany_summary(
* If there are any, return success.
*/
if (sum) {
- *stat = 1;
+ *maxlog = log;
goto out;
}
}
/*
* Found nothing, return failure.
*/
- *stat = 0;
+ *maxlog = -1;
out:
- /* There were no extents at levels < log. */
- if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
+ /* There were no extents at levels > log. */
+ if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
return 0;
}
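
The hunk above inverts the m_rsum_cache invariant: instead of a lower bound on the smallest summary level with free extents (scanned upwards), the cache now holds one more than the largest level with any free extents, so xfs_rtany_summary() can scan downwards and report the largest usable log2 size in *maxlog. A toy userspace illustration of how the cached upper bound prunes the scan (made-up numbers, not the on-disk format):

#include <stdio.h>

#define NLEVELS 8

/* Toy per-level summary counts for one bitmap block: free extents exist
 * only at levels 2 and 4, so the cached value for this block would be 5,
 * one more than the highest non-zero level. */
static int sum[NLEVELS] = { 0, 0, 3, 0, 1, 0, 0, 0 };
static int rsum_cache = 5;

/* Return the largest level in [low, high] with any free extents, or -1. */
static int any_summary(int low, int high, int *probes)
{
	int log;

	if (high > rsum_cache - 1)
		high = rsum_cache - 1;	/* nothing free at levels >= cache */
	for (log = high; log >= low; log--) {
		(*probes)++;
		if (sum[log])
			return log;
	}
	return -1;
}

int main(void)
{
	int probes = 0;
	int maxlog = any_summary(1, NLEVELS - 1, &probes);

	/* Without the cache this would probe levels 7..4 (four reads);
	 * with it, the scan starts at level 4 and stops immediately. */
	printf("maxlog=%d probes=%d\n", maxlog, probes);	/* maxlog=4 probes=1 */
	return 0;
}
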
@@ -97,60 +98,54 @@ out:
* Copy and transform the summary file, given the old and new
* parameters in the mount structures.
*/
-STATIC int /* error */
+STATIC int
xfs_rtcopy_summary(
- xfs_mount_t *omp, /* old file system mount point */
- xfs_mount_t *nmp, /* new file system mount point */
- xfs_trans_t *tp) /* transaction pointer */
+ struct xfs_rtalloc_args *oargs,
+ struct xfs_rtalloc_args *nargs)
{
- xfs_rtblock_t bbno; /* bitmap block number */
- struct xfs_buf *bp; /* summary buffer */
- int error; /* error return value */
- int log; /* summary level number (log length) */
- xfs_suminfo_t sum; /* summary data */
- xfs_fsblock_t sumbno; /* summary block number */
+ xfs_fileoff_t bbno; /* bitmap block number */
+ int error;
+ int log; /* summary level number (log length) */
+ xfs_suminfo_t sum; /* summary data */
- bp = NULL;
- for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
- for (bbno = omp->m_sb.sb_rbmblocks - 1;
+ for (log = oargs->mp->m_rsumlevels - 1; log >= 0; log--) {
+ for (bbno = oargs->mp->m_sb.sb_rbmblocks - 1;
(xfs_srtblock_t)bbno >= 0;
bbno--) {
- error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
- &sumbno, &sum);
+ error = xfs_rtget_summary(oargs, log, bbno, &sum);
if (error)
- return error;
+ goto out;
if (sum == 0)
continue;
- error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
- &bp, &sumbno);
+ error = xfs_rtmodify_summary(oargs, log, bbno, -sum);
if (error)
- return error;
- error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
- &bp, &sumbno);
+ goto out;
+ error = xfs_rtmodify_summary(nargs, log, bbno, sum);
if (error)
- return error;
+ goto out;
ASSERT(sum > 0);
}
}
+ error = 0;
+out:
+ xfs_rtbuf_cache_relse(oargs);
-	return 0;
+	return error;
}
/*
* Mark an extent specified by start and len allocated.
* Updates all the summary information as well as the bitmap.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* start block to allocate */
- xfs_extlen_t len, /* length to allocate */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* start rtext to allocate */
+	xfs_rtxlen_t		len)	/* length to allocate */
{
- xfs_rtblock_t end; /* end of the allocated extent */
- int error; /* error value */
- xfs_rtblock_t postblock = 0; /* first block allocated > end */
- xfs_rtblock_t preblock = 0; /* first block allocated < start */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t end; /* end of the allocated rtext */
+ int error;
+ xfs_rtxnum_t postblock = 0; /* first rtext allocated > end */
+ xfs_rtxnum_t preblock = 0; /* first rtext allocated < start */
end = start + len - 1;
/*
@@ -158,15 +153,15 @@ xfs_rtallocate_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, 0, &preblock);
if (error) {
return error;
}
/*
* Find the next allocated block (end of free extent).
*/
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
+ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
if (error) {
return error;
}
@@ -174,9 +169,9 @@ xfs_rtallocate_range(
* Decrement the summary information corresponding to the entire
* (old) free extent.
*/
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
}
@@ -185,9 +180,9 @@ xfs_rtallocate_range(
* old extent, add summary data for them to be free.
*/
if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(start - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), 1);
if (error) {
return error;
}
@@ -197,9 +192,9 @@ xfs_rtallocate_range(
* old extent, add summary data for them to be free.
*/
if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock - end),
+ xfs_rtx_to_rbmblock(mp, end + 1), 1);
if (error) {
return error;
}
@@ -207,54 +202,69 @@ xfs_rtallocate_range(
/*
* Modify the bitmap to mark this extent allocated.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 0);
+ error = xfs_rtmodify_range(args, start, len, 0);
return error;
}
/*
+ * Make sure we don't run off the end of the rt volume. Be careful that
+ * adjusting maxlen downwards doesn't cause us to fail the alignment checks.
+ */
+static inline xfs_rtxlen_t
+xfs_rtallocate_clamp_len(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t startrtx,
+ xfs_rtxlen_t rtxlen,
+ xfs_rtxlen_t prod)
+{
+ xfs_rtxlen_t ret;
+
+ ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx;
+ return rounddown(ret, prod);
+}
+
+/*
* Attempt to allocate an extent minlen<=len<=maxlen starting from
* bitmap block bbno. If we don't get maxlen then use prod to trim
- * the length, if given. Returns error; returns starting block in *rtblock.
+ * the length, if given. Returns error; returns starting block in *rtx.
* The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_block(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bbno, /* bitmap block number */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- xfs_rtblock_t *nextp, /* out: next block to try */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t bbno, /* bitmap block number */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t *nextp, /* out: next rtext to try */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- xfs_rtblock_t besti; /* best rtblock found so far */
- xfs_rtblock_t bestlen; /* best length found so far */
- xfs_rtblock_t end; /* last rtblock in chunk */
- int error; /* error value */
- xfs_rtblock_t i; /* current rtblock trying */
- xfs_rtblock_t next; /* next rtblock to try */
- int stat; /* status from internal calls */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t besti; /* best rtext found so far */
+ xfs_rtxnum_t bestlen;/* best length found so far */
+ xfs_rtxnum_t end; /* last rtext in chunk */
+ int error;
+ xfs_rtxnum_t i; /* current rtext trying */
+ xfs_rtxnum_t next; /* next rtext to try */
+ int stat; /* status from internal calls */
/*
* Loop over all the extents starting in this bitmap block,
* looking for one that's long enough.
*/
- for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
- end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+ for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0,
+ end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1;
i <= end;
i++) {
/* Make sure we don't scan off the end of the rt volume. */
- maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+ maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod);
/*
* See if there's a free extent of maxlen starting at i.
* If it's not so then next will contain the first non-free.
*/
- error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+ error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat);
if (error) {
return error;
}
@@ -262,13 +272,12 @@ xfs_rtallocate_extent_block(
/*
* i for maxlen is all free, allocate and return that.
*/
- error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
- rsb);
+ error = xfs_rtallocate_range(args, i, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = i;
+ *rtx = i;
return 0;
}
/*
@@ -278,7 +287,7 @@ xfs_rtallocate_extent_block(
* so far, remember it.
*/
if (minlen < maxlen) {
- xfs_rtblock_t thislen; /* this extent size */
+ xfs_rtxnum_t thislen; /* this extent size */
thislen = next - i;
if (thislen >= minlen && thislen > bestlen) {
@@ -290,7 +299,7 @@ xfs_rtallocate_extent_block(
* If not done yet, find the start of the next free space.
*/
if (next < end) {
- error = xfs_rtfind_forw(mp, tp, next, end, &i);
+ error = xfs_rtfind_forw(args, next, end, &i);
if (error) {
return error;
}
@@ -301,7 +310,7 @@ xfs_rtallocate_extent_block(
* Searched the whole thing & didn't find a maxlen free extent.
*/
if (minlen < maxlen && besti != -1) {
- xfs_extlen_t p; /* amount to trim length by */
+ xfs_rtxlen_t p; /* amount to trim length by */
/*
* If size should be a multiple of prod, make that so.
@@ -315,51 +324,49 @@ xfs_rtallocate_extent_block(
/*
* Allocate besti for bestlen & return that.
*/
- error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, besti, bestlen);
if (error) {
return error;
}
*len = bestlen;
- *rtblock = besti;
+ *rtx = besti;
return 0;
}
/*
* Allocation failed. Set *nextp to the next block to try.
*/
*nextp = next;
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
* Allocate an extent of length minlen<=len<=maxlen, starting at block
* bno. If we don't get maxlen then use prod to trim the length, if given.
- * Returns error; returns starting block in *rtblock.
+ * Returns error; returns starting block in *rtx.
* The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_exact(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error; /* error value */
- xfs_extlen_t i; /* extent length trimmed due to prod */
- int isfree; /* extent is free */
- xfs_rtblock_t next; /* next block to try (dummy) */
+ int error;
+ xfs_rtxlen_t i; /* extent length trimmed due to prod */
+ int isfree; /* extent is free */
+ xfs_rtxnum_t next; /* next rtext to try (dummy) */
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
/*
* Check if the range in question (for maxlen) is free.
*/
- error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
+ error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree);
if (error) {
return error;
}
@@ -367,23 +374,23 @@ xfs_rtallocate_extent_exact(
/*
* If it is, allocate it and return success.
*/
- error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, start, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = bno;
+ *rtx = start;
return 0;
}
/*
* If not, allocate what there is, if it's at least minlen.
*/
- maxlen = next - bno;
+ maxlen = next - start;
if (maxlen < minlen) {
/*
* Failed, return failure status.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
@@ -395,81 +402,82 @@ xfs_rtallocate_extent_exact(
/*
* Now we can't do it, return failure status.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
}
/*
* Allocate what we can and return it.
*/
- error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, start, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = bno;
+ *rtx = start;
return 0;
}
/*
* Allocate an extent of length minlen<=len<=maxlen, starting as near
- * to bno as possible. If we don't get maxlen then use prod to trim
+ * to start as possible. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_near(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int any; /* any useful extents from summary */
- xfs_rtblock_t bbno; /* bitmap block number */
- int error; /* error value */
- int i; /* bitmap block offset (loop control) */
- int j; /* secondary loop control */
- int log2len; /* log2 of minlen */
- xfs_rtblock_t n; /* next block to try */
- xfs_rtblock_t r; /* result block */
-
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ struct xfs_mount *mp = args->mp;
+ int maxlog; /* max useful extent from summary */
+ xfs_fileoff_t bbno; /* bitmap block number */
+ int error;
+ int i; /* bitmap block offset (loop control) */
+ int j; /* secondary loop control */
+ int log2len; /* log2 of minlen */
+ xfs_rtxnum_t n; /* next rtext to try */
+ xfs_rtxnum_t r; /* result rtext */
+
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
+
/*
* If the block number given is off the end, silently set it to
* the last block.
*/
- if (bno >= mp->m_sb.sb_rextents)
- bno = mp->m_sb.sb_rextents - 1;
+ if (start >= mp->m_sb.sb_rextents)
+ start = mp->m_sb.sb_rextents - 1;
/* Make sure we don't run off the end of the rt volume. */
- maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
+ maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
if (maxlen < minlen) {
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
* Try the exact allocation first.
*/
- error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
- rbpp, rsb, prod, &r);
+ error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len,
+ prod, &r);
if (error) {
return error;
}
/*
* If the exact allocation worked, return that.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
- bbno = XFS_BITTOBLOCK(mp, bno);
+ bbno = xfs_rtx_to_rbmblock(mp, start);
i = 0;
+ j = -1;
ASSERT(minlen != 0);
log2len = xfs_highbit32(minlen);
/*
@@ -480,8 +488,8 @@ xfs_rtallocate_extent_near(
* Get summary information of extents of all useful levels
* starting in this bitmap block.
*/
- error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
- bbno + i, rbpp, rsb, &any);
+ error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1,
+ bbno + i, &maxlog);
if (error) {
return error;
}
@@ -489,7 +497,10 @@ xfs_rtallocate_extent_near(
* If there are any useful extents starting here, try
* allocating one.
*/
- if (any) {
+ if (maxlog >= 0) {
+ xfs_extlen_t maxavail =
+ min_t(xfs_rtblock_t, maxlen,
+ (1ULL << (maxlog + 1)) - 1);
/*
* On the positive side of the starting location.
*/
@@ -498,17 +509,17 @@ xfs_rtallocate_extent_near(
* Try to allocate an extent starting in
* this block.
*/
- error = xfs_rtallocate_extent_block(mp, tp,
- bbno + i, minlen, maxlen, len, &n, rbpp,
- rsb, prod, &r);
+ error = xfs_rtallocate_extent_block(args,
+ bbno + i, minlen, maxavail, len,
+ &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return it.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
}
@@ -516,68 +527,46 @@ xfs_rtallocate_extent_near(
* On the negative side of the starting location.
*/
else { /* i < 0 */
+ int maxblocks;
+
/*
- * Loop backwards through the bitmap blocks from
- * the starting point-1 up to where we are now.
- * There should be an extent which ends in this
- * bitmap block and is long enough.
+ * Loop backwards to find the end of the extent
+ * we found in the realtime summary.
+ *
+ * maxblocks is the maximum possible number of
+ * bitmap blocks from the start of the extent
+ * to the end of the extent.
*/
- for (j = -1; j > i; j--) {
- /*
- * Grab the summary information for
- * this bitmap block.
- */
- error = xfs_rtany_summary(mp, tp,
- log2len, mp->m_rsumlevels - 1,
- bbno + j, rbpp, rsb, &any);
- if (error) {
- return error;
- }
- /*
- * If there's no extent given in the
- * summary that means the extent we
- * found must carry over from an
- * earlier block. If there is an
- * extent given, we've already tried
- * that allocation, don't do it again.
- */
- if (any)
- continue;
- error = xfs_rtallocate_extent_block(mp,
- tp, bbno + j, minlen, maxlen,
- len, &n, rbpp, rsb, prod, &r);
+ if (maxlog == 0)
+ maxblocks = 0;
+ else if (maxlog < mp->m_blkbit_log)
+ maxblocks = 1;
+ else
+ maxblocks = 2 << (maxlog - mp->m_blkbit_log);
+
+ /*
+ * We need to check bbno + i + maxblocks down to
+ * bbno + i. We already checked bbno down to
+ * bbno + j + 1, so we don't need to check those
+ * again.
+ */
+ j = min(i + maxblocks, j);
+ for (; j >= i; j--) {
+ error = xfs_rtallocate_extent_block(args,
+ bbno + j, minlen,
+ maxavail, len, &n, prod,
+ &r);
if (error) {
return error;
}
/*
* If it works, return the extent.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
}
- /*
- * There weren't intervening bitmap blocks
- * with a long enough extent, or the
- * allocation didn't work for some reason
- * (i.e. it's a little * too short).
- * Try to allocate from the summary block
- * that we found.
- */
- error = xfs_rtallocate_extent_block(mp, tp,
- bbno + i, minlen, maxlen, len, &n, rbpp,
- rsb, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it works, return the extent.
- */
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
- }
}
}
/*
@@ -610,7 +599,7 @@ xfs_rtallocate_extent_near(
else
break;
}
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
@@ -619,26 +608,25 @@ xfs_rtallocate_extent_near(
* specified. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_size(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error; /* error value */
- int i; /* bitmap block number */
- int l; /* level number (loop control) */
- xfs_rtblock_t n; /* next block to be tried */
- xfs_rtblock_t r; /* result block number */
- xfs_suminfo_t sum; /* summary information for extents */
-
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ struct xfs_mount *mp = args->mp;
+ int error;
+ xfs_fileoff_t i; /* bitmap block number */
+ int l; /* level number (loop control) */
+ xfs_rtxnum_t n; /* next rtext to be tried */
+ xfs_rtxnum_t r; /* result rtext number */
+ xfs_suminfo_t sum; /* summary information for extents */
+
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
ASSERT(maxlen != 0);
/*
@@ -656,8 +644,7 @@ xfs_rtallocate_extent_size(
/*
* Get the summary for this level/block.
*/
- error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
- &sum);
+ error = xfs_rtget_summary(args, l, i, &sum);
if (error) {
return error;
}
@@ -669,16 +656,16 @@ xfs_rtallocate_extent_size(
/*
* Try allocating the extent.
*/
- error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
- maxlen, len, &n, rbpp, rsb, prod, &r);
+ error = xfs_rtallocate_extent_block(args, i, maxlen,
+ maxlen, len, &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return that.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
/*
@@ -686,8 +673,8 @@ xfs_rtallocate_extent_size(
* allocator is beyond the next bitmap block,
* skip to that bitmap block.
*/
- if (XFS_BITTOBLOCK(mp, n) > i + 1)
- i = XFS_BITTOBLOCK(mp, n) - 1;
+ if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(mp, n) - 1;
}
}
/*
@@ -695,7 +682,7 @@ xfs_rtallocate_extent_size(
* we're asking for a fixed size extent.
*/
if (minlen > --maxlen) {
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
ASSERT(minlen != 0);
@@ -715,8 +702,7 @@ xfs_rtallocate_extent_size(
/*
* Get the summary information for this level/block.
*/
- error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
- &sum);
+ error = xfs_rtget_summary(args, l, i, &sum);
if (error) {
return error;
}
@@ -730,18 +716,18 @@ xfs_rtallocate_extent_size(
* minlen/maxlen are in the possible range for
* this summary level.
*/
- error = xfs_rtallocate_extent_block(mp, tp, i,
+ error = xfs_rtallocate_extent_block(args, i,
XFS_RTMAX(minlen, 1 << l),
XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
- len, &n, rbpp, rsb, prod, &r);
+ len, &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return that extent.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
/*
@@ -749,14 +735,14 @@ xfs_rtallocate_extent_size(
* allocator is beyond the next bitmap block,
* skip to that bitmap block.
*/
- if (XFS_BITTOBLOCK(mp, n) > i + 1)
- i = XFS_BITTOBLOCK(mp, n) - 1;
+ if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(mp, n) - 1;
}
}
/*
* Got nothing, return failure.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
@@ -886,12 +872,14 @@ xfs_alloc_rsum_cache(
xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */
{
/*
- * The rsum cache is initialized to all zeroes, which is trivially a
- * lower bound on the minimum level with any free extents. We can
- * continue without the cache if it couldn't be allocated.
+ * The rsum cache is initialized to the maximum value, which is
+ * trivially an upper bound on the maximum level with any free extents.
+ * We can continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
- if (!mp->m_rsum_cache)
+ mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL);
+ if (mp->m_rsum_cache)
+ memset(mp->m_rsum_cache, -1, rbmblocks);
+ else
xfs_warn(mp, "could not allocate realtime summary cache");
}
@@ -907,13 +895,13 @@ xfs_growfs_rt(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_rt_t *in) /* growfs rt input struct */
{
- xfs_rtblock_t bmbno; /* bitmap block number */
+ xfs_fileoff_t bmbno; /* bitmap block number */
struct xfs_buf *bp; /* temporary buffer */
int error; /* error return value */
xfs_mount_t *nmp; /* new (fake) mount structure */
xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
- xfs_rtblock_t nrextents; /* new number of realtime extents */
+ xfs_rtxnum_t nrextents; /* new number of realtime extents */
uint8_t nrextslog; /* new log2 of sb_rextents */
xfs_extlen_t nrsumblocks; /* new number of summary blocks */
uint nrsumlevels; /* new rt summary levels */
@@ -922,7 +910,6 @@ xfs_growfs_rt(
xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
xfs_extlen_t rsumblocks; /* current number of rt summary blks */
xfs_sb_t *sbp; /* old superblock */
- xfs_fsblock_t sumbno; /* summary block number */
uint8_t *rsum_cache; /* old summary cache */
sbp = &mp->m_sb;
@@ -954,7 +941,7 @@ xfs_growfs_rt(
return -EINVAL;
/* Unsupported realtime features. */
- if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
+ if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp))
return -EOPNOTSUPP;
nrblocks = in->newblocks;
@@ -976,11 +963,10 @@ xfs_growfs_rt(
*/
nrextents = nrblocks;
do_div(nrextents, in->extsize);
- nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
+ nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
nrextslog = xfs_highbit32(nrextents);
nrsumlevels = nrextslog + 1;
- nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
- nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks);
nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
/*
* New summary size can't be more than half the size of
@@ -1023,6 +1009,12 @@ xfs_growfs_rt(
((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
bmbno < nrbmblocks;
bmbno++) {
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ };
+ struct xfs_rtalloc_args nargs = {
+ .mp = nmp,
+ };
struct xfs_trans *tp;
xfs_rfsblock_t nrblocks_step;
@@ -1032,19 +1024,17 @@ xfs_growfs_rt(
* Calculate new sb and mount fields for this round.
*/
nsbp->sb_rextsize = in->extsize;
+ nmp->m_rtxblklog = -1; /* don't use shift or masking */
nsbp->sb_rbmblocks = bmbno + 1;
nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
nsbp->sb_rextsize;
nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
- nsbp->sb_rextents = nsbp->sb_rblocks;
- do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+ nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);
ASSERT(nsbp->sb_rextents != 0);
nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
- nrsumsize =
- (uint)sizeof(xfs_suminfo_t) * nrsumlevels *
- nsbp->sb_rbmblocks;
- nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels,
+ nsbp->sb_rbmblocks);
nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
/*
* Start a transaction, get the log reservation.
@@ -1053,6 +1043,9 @@ xfs_growfs_rt(
&tp);
if (error)
break;
+ args.tp = tp;
+ nargs.tp = tp;
+
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
@@ -1086,7 +1079,7 @@ xfs_growfs_rt(
*/
if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
mp->m_rsumlevels != nmp->m_rsumlevels) {
- error = xfs_rtcopy_summary(mp, nmp, tp);
+ error = xfs_rtcopy_summary(&args, &nargs);
if (error)
goto error_cancel;
}
@@ -1111,9 +1104,9 @@ xfs_growfs_rt(
/*
* Free new extent.
*/
- bp = NULL;
- error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
- nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+ error = xfs_rtfree_range(&nargs, sbp->sb_rextents,
+ nsbp->sb_rextents - sbp->sb_rextents);
+ xfs_rtbuf_cache_relse(&nargs);
if (error) {
error_cancel:
xfs_trans_cancel(tp);
@@ -1171,59 +1164,60 @@ out_free:
* parameters. The length units are all in realtime extents, as is the
* result block number.
*/
-int /* error */
+int
xfs_rtallocate_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_trans *tp,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ int wasdel, /* was a delayed allocation extent */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtblock) /* out: start rtext allocated */
{
- xfs_mount_t *mp = tp->t_mountp;
- int error; /* error value */
- xfs_rtblock_t r; /* result allocated block */
- xfs_fsblock_t sb; /* summary file block number */
- struct xfs_buf *sumbp; /* summary file block buffer */
-
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ struct xfs_rtalloc_args args = {
+ .mp = tp->t_mountp,
+ .tp = tp,
+ };
+ int error; /* error value */
+ xfs_rtxnum_t r; /* result allocated rtext */
+
+ ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL));
ASSERT(minlen > 0 && minlen <= maxlen);
/*
* If prod is set then figure out what to do to minlen and maxlen.
*/
if (prod > 1) {
- xfs_extlen_t i;
+ xfs_rtxlen_t i;
if ((i = maxlen % prod))
maxlen -= i;
if ((i = minlen % prod))
minlen += prod - i;
if (maxlen < minlen) {
- *rtblock = NULLRTBLOCK;
+ *rtblock = NULLRTEXTNO;
return 0;
}
}
retry:
- sumbp = NULL;
- if (bno == 0) {
- error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
- &sumbp, &sb, prod, &r);
+ if (start == 0) {
+ error = xfs_rtallocate_extent_size(&args, minlen,
+ maxlen, len, prod, &r);
} else {
- error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
- len, &sumbp, &sb, prod, &r);
+ error = xfs_rtallocate_extent_near(&args, start, minlen,
+ maxlen, len, prod, &r);
}
+ xfs_rtbuf_cache_relse(&args);
if (error)
return error;
/*
* If it worked, update the superblock.
*/
- if (r != NULLRTBLOCK) {
+ if (r != NULLRTEXTNO) {
long slen = (long)*len;
ASSERT(*len >= minlen && *len <= maxlen);
@@ -1250,6 +1244,7 @@ xfs_rtmount_init(
struct xfs_buf *bp; /* buffer for last block of subvolume */
struct xfs_sb *sbp; /* filesystem superblock copy in mount */
xfs_daddr_t d; /* address of last block of subvolume */
+ unsigned int rsumblocks;
int error;
sbp = &mp->m_sb;
@@ -1261,10 +1256,9 @@ xfs_rtmount_init(
return -ENODEV;
}
mp->m_rsumlevels = sbp->sb_rextslog + 1;
- mp->m_rsumsize =
- (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
- sbp->sb_rbmblocks;
- mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+ rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels,
+ mp->m_sb.sb_rbmblocks);
+ mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
mp->m_rbmip = mp->m_rsumip = NULL;
/*
* Check that the realtime section is an ok size.
@@ -1418,27 +1412,28 @@ xfs_rtunmount_inodes(
* of rtextents and the fraction.
* The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
*/
-int /* error */
+int /* error */
xfs_rtpick_extent(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick) /* result rt extent */
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtxlen_t len, /* allocation length (rtextents) */
+ xfs_rtxnum_t *pick) /* result rt extent */
{
- xfs_rtblock_t b; /* result block */
- int log2; /* log of sequence number */
- uint64_t resid; /* residual after log removed */
- uint64_t seq; /* sequence number of file creation */
- uint64_t *seqp; /* pointer to seqno in inode */
+ xfs_rtxnum_t b; /* result rtext */
+ int log2; /* log of sequence number */
+ uint64_t resid; /* residual after log removed */
+ uint64_t seq; /* sequence number of file creation */
+ struct timespec64 ts; /* timespec in inode */
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
+ ts = inode_get_atime(VFS_I(mp->m_rbmip));
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
- *seqp = 0;
+ seq = 0;
+ } else {
+ seq = ts.tv_sec;
}
- seq = *seqp;
if ((log2 = xfs_highbit64(seq)) == -1)
b = 0;
else {
@@ -1450,7 +1445,8 @@ xfs_rtpick_extent(
if (b + len > mp->m_sb.sb_rextents)
b = mp->m_sb.sb_rextents - len;
}
- *seqp = seq + 1;
+ ts.tv_sec = seq + 1;
+ inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
*pick = b;
return 0;
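
One subtlety in the xfs_rtallocate_clamp_len() helper added near the top of this file's diff: clamping maxlen to the end of the rt volume with a bare min(), as the old code did, can leave a length that is no longer a multiple of prod and would trip the alignment asserts in the callers, so the helper rounds the clamped value back down. A standalone illustration of the arithmetic with sample numbers:

#include <stdio.h>
#include <stdint.h>

/* Mirror of the helper's arithmetic, outside the kernel. */
static uint32_t clamp_len(uint64_t rextents, uint64_t start,
			  uint32_t len, uint32_t prod)
{
	uint64_t end = start + len;
	uint32_t ret;

	if (end > rextents)
		end = rextents;		/* min(rextents, start + len) */
	ret = (uint32_t)(end - start);
	return ret - (ret % prod);	/* rounddown(ret, prod) */
}

int main(void)
{
	/*
	 * rt volume has 100 rtextents, the request starts at 95 for 10,
	 * prod (alignment) is 4: plain clamping yields 5, which is not a
	 * multiple of 4; rounding down gives 4.
	 */
	printf("%u\n", clamp_len(100, 95, 10, 4));	/* prints 4 */
	return 0;
}
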
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 62c7ad79cbb6..f7cb9ffe51ca 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -11,22 +11,6 @@
struct xfs_mount;
struct xfs_trans;
-/*
- * XXX: Most of the realtime allocation functions deal in units of realtime
- * extents, not realtime blocks. This looks funny when paired with the type
- * name and screams for a larger cleanup.
- */
-struct xfs_rtalloc_rec {
- xfs_rtblock_t ar_startext;
- xfs_rtblock_t ar_extcount;
-};
-
-typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *rec,
- void *priv);
-
#ifdef CONFIG_XFS_RT
/*
* Function prototypes for exported functions.
@@ -40,23 +24,14 @@ typedef int (*xfs_rtalloc_query_range_fn)(
int /* error */
xfs_rtallocate_extent(
struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
int wasdel, /* was a delayed allocation extent */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock); /* out: start block allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtblock); /* out: start rtext allocated */
-/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len); /* length of extent freed */
/*
* Initialize realtime fields in the mount structure.
@@ -87,8 +62,8 @@ int /* error */
xfs_rtpick_extent(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick); /* result rt extent */
+ xfs_rtxlen_t len, /* allocation length (rtextents) */
+ xfs_rtxnum_t *pick); /* result rt extent */
/*
* Grow the realtime area of the filesystem.
@@ -98,55 +73,12 @@ xfs_growfs_rt(
struct xfs_mount *mp, /* file system mount structure */
xfs_growfs_rt_t *in); /* user supplied growfs struct */
-/*
- * From xfs_rtbitmap.c
- */
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
-int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val,
- xfs_rtblock_t *new, int *stat);
-int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val);
-int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
- int log, xfs_rtblock_t bbno, int delta,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb,
- xfs_suminfo_t *sum);
-int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
- xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp,
- xfs_fsblock_t *rsb);
-int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *low_rec,
- const struct xfs_rtalloc_rec *high_rec,
- xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
-bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- bool *is_free);
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
-# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
-# define xfs_rtfree_extent(t,b,l) (ENOSYS)
-# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
-# define xfs_growfs_rt(mp,in) (ENOSYS)
-# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
-# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
-# define xfs_verify_rtbno(m, r) (false)
-# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
-# define xfs_rtalloc_reinit_frextents(m) (0)
+# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS)
+# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS)
+# define xfs_growfs_rt(mp,in) (-ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
@@ -157,7 +89,7 @@ xfs_rtmount_init(
xfs_warn(mp, "Not built with CONFIG_XFS_RT");
return -ENOSYS;
}
-# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 819a3568b28f..764304595e8b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -42,6 +42,7 @@
#include "xfs_xattr.h"
#include "xfs_iunlink_item.h"
#include "xfs_dahash_test.h"
+#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
#include <linux/magic.h>
@@ -361,14 +362,15 @@ STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
const char *name,
- struct block_device **bdevp)
+ struct bdev_handle **handlep)
{
int error = 0;
- *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
- mp->m_super, &fs_holder_ops);
- if (IS_ERR(*bdevp)) {
- error = PTR_ERR(*bdevp);
+ *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ mp->m_super, &fs_holder_ops);
+ if (IS_ERR(*handlep)) {
+ error = PTR_ERR(*handlep);
+ *handlep = NULL;
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
}
@@ -433,7 +435,7 @@ xfs_open_devices(
{
struct super_block *sb = mp->m_super;
struct block_device *ddev = sb->s_bdev;
- struct block_device *logdev = NULL, *rtdev = NULL;
+ struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL;
int error;
/*
@@ -446,17 +448,19 @@ xfs_open_devices(
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
- error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
if (error)
goto out_relock;
}
if (mp->m_rtname) {
- error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
if (error)
goto out_close_logdev;
- if (rtdev == ddev || rtdev == logdev) {
+ if (rtdev_handle->bdev == ddev ||
+ (logdev_handle &&
+ rtdev_handle->bdev == logdev_handle->bdev)) {
xfs_warn(mp,
"Cannot mount filesystem with identical rtdev and ddev/logdev.");
error = -EINVAL;
@@ -468,22 +472,25 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
- if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+ if (rtdev_handle) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
- if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+ if (logdev_handle && logdev_handle->bdev != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
+ /* Handle won't be used, drop it */
+ if (logdev_handle)
+ bdev_release(logdev_handle);
}
error = 0;
@@ -497,11 +504,11 @@ out_relock:
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- if (rtdev)
- blkdev_put(rtdev, sb);
+ if (rtdev_handle)
+ bdev_release(rtdev_handle);
out_close_logdev:
- if (logdev && logdev != ddev)
- blkdev_put(logdev, sb);
+ if (logdev_handle)
+ bdev_release(logdev_handle);
goto out_relock;
}
@@ -890,7 +897,7 @@ xfs_fs_statfs(
statp->f_blocks = sbp->sb_rblocks;
freertx = percpu_counter_sum_positive(&mp->m_frextents);
- statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
+ statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx);
}
return 0;
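
The xfs_super.c hunks above move the extra log/rt devices from bare block_device pointers to the handle-based open API (bdev_open_by_path()/bdev_release()), with the block_device reached through handle->bdev. A short kernel-style sketch of that open/close pattern; the function names here are hypothetical:

/* Hypothetical open of an extra device with the handle-based API. */
static struct bdev_handle *open_extra_dev(struct super_block *sb,
					  const char *path)
{
	struct bdev_handle *handle;

	handle = bdev_open_by_path(path, BLK_OPEN_READ | BLK_OPEN_WRITE,
				   sb, &fs_holder_ops);
	if (IS_ERR(handle))
		return handle;		/* caller checks with IS_ERR() */

	/* The block_device itself is reached through the handle. */
	pr_info("opened %pg\n", handle->bdev);
	return handle;
}

static void close_extra_dev(struct bdev_handle *handle)
{
	if (handle)
		bdev_release(handle);	/* replaces blkdev_put(bdev, holder) */
}
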
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8c0bfc9a33b1..305c9d07bf1b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -24,6 +24,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_icache.h"
+#include "xfs_rtbitmap.h"
struct kmem_cache *xfs_trans_cache;
@@ -655,6 +656,10 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
+ if (tp->t_rextsize_delta) {
+ mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize);
+ }
mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
mp->m_sb.sb_rextents += tp->t_rextents_delta;
@@ -1196,7 +1201,7 @@ xfs_trans_alloc_inode(
retry:
error = xfs_trans_alloc(mp, resv, dblocks,
- rblocks / mp->m_sb.sb_rextsize,
+ xfs_extlen_to_rtxlen(mp, rblocks),
force ? XFS_TRANS_RESERVE : 0, &tp);
if (error)
return error;
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index a3975f325f4e..987843f84d03 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -186,7 +186,7 @@ static const struct xattr_handler xfs_xattr_security_handler = {
.set = xfs_xattr_set,
};
-const struct xattr_handler *xfs_xattr_handlers[] = {
+const struct xattr_handler * const xfs_xattr_handlers[] = {
&xfs_xattr_user_handler,
&xfs_xattr_trusted_handler,
&xfs_xattr_security_handler,
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
index 2b09133b1b9b..cec766cad26c 100644
--- a/fs/xfs/xfs_xattr.h
+++ b/fs/xfs/xfs_xattr.h
@@ -8,6 +8,6 @@
int xfs_attr_change(struct xfs_da_args *args);
-extern const struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler * const xfs_xattr_handlers[];
#endif /* __XFS_XATTR_H__ */
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 9d1a9808fbbb..e6a75401677d 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -658,8 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
inode->i_ino = ino;
inode->i_mode = z->z_mode;
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- inode_get_ctime(dir));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(dir))));
inode->i_uid = z->z_uid;
inode->i_gid = z->z_gid;
inode->i_size = z->z_wpoffset;
@@ -695,8 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
inode->i_ino = ino;
inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555);
inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- inode_get_ctime(root));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(root))));
inode->i_private = &sbi->s_zgroup[ztype];
set_nlink(inode, 2);
@@ -1319,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
inode->i_ino = bdev_nr_zones(sb->s_bdev);
inode->i_mode = S_IFDIR | 0555;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &zonefs_dir_inode_operations;
inode->i_fop = &zonefs_dir_operations;
inode->i_size = 2;